def setUp(self):
        filename = datapath("alldata-id-10.txt")
        train_docs = read_sentiment_docs(filename)
        self.train_docs = train_docs
        self.source_doc_vec_file = datapath("small_tag_doc_5_iter50")
        self.target_doc_vec_file = datapath("large_tag_doc_10_iter50")

        self.source_doc_vec = Doc2Vec.load(self.source_doc_vec_file)
        self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file)
Example #2
def do_command(args):
    # Load data
    data = load_data(args.input)
    #ids, documents = zip(*data)
    data = [(id, tokenize(doc)) for id, doc in data]
    ids = [id for id, _ in data]

    if not os.path.exists(args.modelfile):
        model = embed_documents(data)
        # Save model
        model.save(args.modelfile)
    else:
        model = Doc2Vec.load(args.modelfile)
        #map(model.infer_tokens, tokenized)
    print("Loaded model.")
    # Do k-nearest neighbors search.

    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow(["id1", "id2", "score"])
    count = int(args.count) if args.count > 0 else len(model.docvecs)
    vectors = np.array([model.docvecs[i] for i in range(count)])
    del model # clear up memory

    for i, j, score in find_nearest_neighbors(vectors):
        id1, id2 = ids[i], ids[j]
        writer.writerow([id1, id2, score])
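
The find_nearest_neighbors helper called above is not included in this snippet. A minimal sketch of what such a helper could look like, using scikit-learn's NearestNeighbors with cosine distance (an assumption, not the original implementation):

# Hypothetical sketch of the find_nearest_neighbors helper used above; the
# original implementation is not shown in this example.
from sklearn.neighbors import NearestNeighbors

def find_nearest_neighbors(vectors, n_neighbors=2):
    nn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(vectors)
    distances, indices = nn.kneighbors(vectors)
    for i, (dists, idxs) in enumerate(zip(distances, indices)):
        for dist, j in zip(dists, idxs):
            if i != j:
                yield i, j, 1.0 - dist  # report cosine similarity as the score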
Example #3
    def __init__(self, sentences, name, dataset_name, epochs=1, dimension=50, modelfile=None):
        self.inner_model = None

        # parameters
        self.dataset = dataset_name
        self.sentences = sentences
        self.name = name
        self.epochs = epochs
        self.dimension = dimension

        # data file path
        models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
        if modelfile is not None:
            filename = modelfile
        else:
            filename = "DOC2VEC_%s_%s_%s_%s" % (self.dataset, self.name, self.epochs, self.dimension)
        self.filepath = os.path.join(models_folder, filename)
        model_exists = os.path.isfile(self.filepath)

        # train initial model
        if model_exists:
            logging.info("found data file %s" % (self.filepath, ))
            self.inner_model = Doc2Vec.load(self.filepath)
        else:
            self.inner_model = Doc2Vec(sentences, size=self.dimension)
            print self.inner_model.vocab.keys()
            self.inner_model.save(fname=self.filepath)
Example #4
    def __init__(self, size=300, window=8, min_count=2, workers=8, path_to_model=None, stream_train=False):

        '''
        Initializes the Doc2Vec_Wrapper class. 

        Args:
            size (int): Specifies the size of the feature-vector. Defaults to 300
            window (int): Specifies the size of the context window from which the feature vector is learned
            min_count (int): Specifies the minimum number of instances of each word that is saved in the model
            workers (int): number of parallel processes
            path_to_model (str): Specifies model on disk 
            stream_train (bool): If true, update word vectors with new sentences. If false, just get doc vecs
        '''

        self.stream_train=stream_train

        self.is_trained = False
        self.model = None

        ## if a path is passed, try to load from disk. Otherwise, retrain anyway
        if path_to_model:
            try:
                self.model = Doc2Vec.load(path_to_model)
                self.is_trained = True
            except:
                pass

        ## params for Doc2Vec 
        self.size = size ## size of the vector
        self.window = window ## size of the context window
        self.min_count = min_count ## minimum count of vocab to store in binary tree
        self.workers = workers ## number of parallel processes == number of cores on the computer
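
A minimal usage sketch for this wrapper (the class name Doc2Vec_Wrapper and the model filename are assumptions drawn from the docstring, not shown elsewhere in the snippet):

# Hypothetical usage of the wrapper above; the model path is an assumed filename.
wrapper = Doc2Vec_Wrapper(size=300, window=8, path_to_model='my_docs.d2v')
if wrapper.is_trained:
    print('Loaded a pre-trained Doc2Vec model with vector size', wrapper.size)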
Example #5
def test_category():
    from gensim.models.doc2vec import Doc2Vec
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import ConceptRelatedness
    model_category = Doc2Vec.load(FileIO.filename('models/category/cat2vec'))
    cat2vec_rel = ConceptRelatedness(model_category)
    print(cat2vec_rel.word_similarity('happy','sad'))
 def load_external(self, model_file_name):
     """
     load a word2vec model from the file specified
     :param model_file_name: name of the model file
     :return:
     """
     self.model = Doc2Vec.load(model_file_name)
Example #7
File: doc2vec.py Project: wtgme/ohsn
def varify():
    from gensim.models.doc2vec import Doc2Vec
    model = Doc2Vec.load('data/doc2vec.d2v')
    documents = pickle.load(open('data/fedcorpus.pick', 'r'))
    for i in xrange(3):
        inferred_docvec = model.infer_vector(documents[i].words)
        print documents[i].tags
        print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))
def main():
    """
    1. Divide total dataset into several data bins by randomly extracting data entries with given ratio.
    2. Run cross-validation for given numbers of iterations in either SMOTE or non-SMOTE mode.
    3. Report and present statistical evaluations for each data bin.
    """
    stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns = list(), list(), list() # ns for non-SMOTE
    stats_Fscores_ws, stats_recalls_ws, stats_precisions_ws = list(), list(), list() # ws for with SMOTE
    data_pos, data_neg = load_data("../data/")
    data_pos, data_neg = data_filter(data_pos), data_filter(data_neg)
    print "Loading Doc2Vec model ..."
    model_doc2vec = Doc2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True) # load Doc2Vec model
    print "Doc2Vec model loading done!"
    models = {"SVC": sklearn.svm.SVC(), \
              "Logit": sklearn.linear_model.LogisticRegression(), \
              "DT": sklearn.tree.DecisionTreeClassifier(), \
              "NBayes": sklearn.naive_bayes.GaussianNB(), \
              "NNeighbors": sklearn.neighbors.nearest_centroid.NearestCentroid()}
    model_chosen = "NBayes"
    print "Classifier Type:", model_chosen
    for binIndex in range(NUM_OF_BINS):
        print "Experiment on DataSet#", str(binIndex)
        random.shuffle(data_pos)
        random.shuffle(data_neg)
        size_pos_bin, size_neg_bin = int(len(data_pos)*SAMPLE_SIZE_RATIO), int(len(data_neg)*SAMPLE_SIZE_RATIO)
        data_pos_bin, data_neg_bin = data_pos[:size_pos_bin], data_neg[:size_neg_bin] # dataset bin
        sFscores_iter_ns, sRecalls_iter_ns, sPrecisions_iter_ns = list(), list(), list()
        sFscores_iter_ws, sRecalls_iter_ws, sPrecisions_iter_ws = list(), list(), list()
        for iteration in range(NUM_OF_ITERATION):
            random.seed(iteration)
            random.shuffle(data_pos_bin)
            random.shuffle(data_neg_bin)
            data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos_bin, data_neg_bin, model_doc2vec) # convert to doc vectors
            print "non-SMOTE experiment"
            accuracys, precisions, recalls, Fscores = cross_validationS( \
                data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD,
                smote_flag=False)  # cross validation
            sFscores_iter_ns.extend(Fscores)
            sRecalls_iter_ns.extend(recalls)
            sPrecisions_iter_ns.extend(precisions)
            print "with SMOTE experiemnt"
            accuracys, precisions, recalls, Fscores = cross_validationS( \
                data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD,
                smote_flag=True)  # cross validation
            sFscores_iter_ws.extend(Fscores)
            sRecalls_iter_ws.extend(recalls)
            sPrecisions_iter_ws.extend(precisions)
        stats_Fscores_ns.append(sFscores_iter_ns)
        stats_precisions_ns.append(sPrecisions_iter_ns)
        stats_recalls_ns.append(sRecalls_iter_ns)
        stats_Fscores_ws.append(sFscores_iter_ws)
        stats_precisions_ws.append(sPrecisions_iter_ws)
        stats_recalls_ws.append(sRecalls_iter_ws)
    print "All Experiments Done!"
    save_stats(stats_Fscores_ns, stats_recalls_ns, stats_precisions_ns, stats_Fscores_ws, stats_recalls_ws,\
               stats_precisions_ws, model_name=model_chosen)
    print "Statistics ready!"
def get_model(model_num, model_names):
    
    
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        model = Word2Vec.load_word2vec_format(model_path + model_names, binary=True)  # C text format
    return model
Example #10
def create_and_train_models_d2vec(tag, cores=6):
    """
    Build vocabulary and train models
    :param tag: small or big 
    :param cores: number of cores
    :return: the current models
    """
    simple_models = get_models_d2vec(cores)
    model_files = get_models_filename_d2vec(tag)
    if all([os.path.exists(file) for file in model_files]):
        print('Models exist, loading...')
        for i, fname in enumerate(model_files):
            simple_models[i] = Doc2Vec.load(fname)
        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        return models_by_name
    else:
        print('Building models...')
        voc_model = build_vocab_d2vec(tag, cores)
        # Share vocabulary between models
        for model in simple_models:
            model.reset_from(voc_model)

        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        print('Training models...')
        print("START %s" % datetime.datetime.now())
        best_error = defaultdict(lambda: 1.0)  # to selectively-print only best errors achieved

        alpha, min_alpha, passes = (0.025, 0.001, 20)
        alpha_delta = (alpha - min_alpha) / passes
        file = x_train_str.format(tag)
        x_train = pd.read_hdf(file)
        train_list = x_train.tolist()

        for epoch in range(passes):
            shuffle(train_list)  # shuffling gets best results

            for name, train_model in models_by_name.items():
                # train
                duration = 'na'
                train_model.alpha, train_model.min_alpha = alpha, alpha
                with elapsed_timer() as elapsed:
                    train_model.train(CorpusStream(train_list, 'train'), total_examples=train_model.corpus_count,
                                      epochs=train_model.iter)
                    duration = '%.1f' % elapsed()

            print('completed pass %i at alpha %f' % (epoch + 1, alpha))
            alpha -= alpha_delta

        print("END %s" % str(datetime.datetime.now()))
        for name, model in models_by_name.items():
            name = name.replace('/', '').replace(',', '_')
            model.save('models/{0}_{1}.m'.format(name, tag))

    return models_by_name
Example #11
def get_WordVector_matrix(label):
    model = Doc2Vec.load('./WordVector_model.d2v')
    size = len(label)
    vectors = np.zeros((size,depth))
    for i in range(size):
        try:
            doc_vector = model.docvecs[str(i)]
            vectors[i]=(doc_vector[0])
        except KeyError:
            print str(i) + ' occurs KeyError'
            pass
    return map(list,vectors)
Example #12
def build_model(x_train, x_test, iteration=5, save=True):
    if save:
        big_list = x_train + x_test
        model = Doc2Vec(min_count=2, window=10, size=100, sample=1e-4, negative=5, workers=8)
        model.build_vocab(big_list)
        for i in range(iteration):
            model.train(big_list)
        print 'saving model to file.....'
        model.save('./sentim.d2v')
    else:
        print 'loading model from file.....'
        model = Doc2Vec.load('./sentim.d2v')
    return model
def test_models( FULL_SIM, models_files ):
    test_papers = pd.read_csv( TEST_FILEPATH )

    # NOTE: Only need for testing with AII:
    keywords_docsrels = populate_iks_dict()
    authorities = initialize_authorities()

    for mod_f in models_files:
        print( 'Testing '+ mod_f )
        model = Doc2Vec.load( mod_f )
        print( 'Model loaded.' )

        test_model( FULL_SIM, model, test_papers, keywords_docsrels, authorities )
Example #14
def get_vec(vector_file, id_file, w_file):
    p2v = Doc2Vec.load(vector_file)
    fout = open(w_file, "w")
    index = 0
    with open(id_file) as f:
        for line in f:
            index += 1
            if index % 1000 == 0:
                logging("%d cases" % index)
            line = line.strip()
            vec = p2v.docvecs[line]
            line_w = line + "\t" + "\t".join([str(x) for x in vec]) + "\t" + "\n"
            fout.write(line_w)
    fout.close()
Example #15
def datacluster(data):
	infered_vectors_list = []
	print "load model..."
	model_dm = Doc2Vec.load(model_path)
	print "load train vectors..."
	for text, label in data:
		vector = model_dm.infer_vector(text)
		infered_vectors_list.append(vector)
	'''
	print "Check the optimized parameter..."
	Nc = range(1, 50)
	pca_data = [PCA(n_components = i).fit(infered_vectors_list).transform(infered_vectors_list) for i in Nc]
	kmeans = cluster.KMeans(init='k-means++',n_clusters=20,max_iter=300)
	score = [kmeans.fit(pca_data[i]).score(pca_data[i]) for i in range(len(pca_data))]
	print score
	plt.plot(Nc,score)
	plt.xlabel('PCA components')
	plt.ylabel('Score')
	plt.title('Elbow Curve')
	plt.show()
	'''

	print "PCA decomposition..."
	pca = PCA(n_components = 10).fit(infered_vectors_list)
	pca_data = pca.transform(infered_vectors_list)
	print "train K-Means model..."
	kmean_model = cluster.KMeans(init='k-means++',n_clusters=16,max_iter=300)
	kmean_model.fit(pca_data)
	#get the classified index
	result = kmean_model.fit_predict(pca_data)
	print "Predicting result:", result
	#save the cluster result
	joblib.dump(kmean_model, cluster_path)
	#load the cluster result
#	new_km = joblib.load(cluster_path)
	numSamples = len(pca_data) 
	print numSamples
	centroids = kmean_model.labels_
	
	#print centroids,type(centroids)  # show the cluster centers
	#print kmean_model.inertia_  # show the clustering quality
	'''	
	marker = ['o', '.', ',', 'x', '*', 'd', 's', 'p']
	color = ['r', 'g', 'b', 'c', 'm', 'k', 'y', 'w']
	for i in xrange(numSamples):
		plt.scatter(pca_data[i][0], pca_data[i][1], \
				marker=marker[centroids[i]], color=color[centroids[i]])
	plt.show()
	'''
	return centroids
Example #16
def main():
    #load data set
    training_reviews = load_dataset(TRAIN_FILE)
    testing_reviews = load_dataset(TEST_FILE)

    #load doc2vec model
    doc2vec_model = Doc2Vec.load(DOC2VEC_MODEL)

    cate_index = get_all_categories(training_reviews)
    cates = dict2list(cate_index)
    n_cates = len(cates)

    train_X = get_X(training_reviews, doc2vec_model)
    test_X = get_X(testing_reviews, doc2vec_model)

    train_labels = get_labels(training_reviews, cate_index)
    test_labels = get_labels(testing_reviews, cate_index)

    labelwise_acc = []
    labelwise_output = []

    for cate in range(n_cates):
        # train a binary model
        train_Y = get_Y(train_labels, cate)
        prob = svm_problem(train_Y, train_X)
        param = svm_parameter("-s 0 -t 2 -b 1")
        m = svm_train(prob, param)

        # test
        test_Y = get_Y(test_labels, cate)
        p_label, p_acc, p_val = svm_predict(test_Y, test_X, m, '-b 1')

        labelwise_acc.append(p_acc)
        labelwise_output.append(p_label)

    # evaluation
    p, r, f = microF1(labelwise_output, test_labels)

    # output
    out_dir = "../data/use_doc2vec/"
    out_file = out_dir + "laptop.txt"
    labelwise_acc = [(cates[i], labelwise_acc[i][0]) for i in range(n_cates)]
    labelwise_acc = sorted(labelwise_acc, key=lambda x:x[1])
    with open(out_file, 'w') as out:
        out.write("Precision:\t{}\nRecall:\t{}\nF1:\t{}\n".format(p, r, f))
        print("{}\n{}\n{}".format(p, r, f))
        for cate_i in range(n_cates):
            out.write("{}:\t{}\n".format(labelwise_acc[cate_i][0], labelwise_acc[cate_i][1]))
def get_distances_subset(n_closest, category_hash_with_doc_ids, csv_path):
    # example
    # category_hash_with_doc_ids = {"cat1":["us-1", "us-2"], "cat2": ["us-3"]}
    # loop over subjects and average docvecs belonging to subject.
    # place in dictionary
    model = Doc2Vec.load('../doc2vec_model')
    cpc_vectors  = get_category_vectors_subset(model, category_hash_with_doc_ids)
    distance_mat = get_distance_mat(cpc_vectors)

    to_csv = []
    for subj_id in list(category_hash_with_doc_ids.keys()):
        relateds = get_n_closest(distance_mat, subj_id, n=n_closest)
        for related_id, dist in relateds.iteritems():
            weight = round(1./dist)
            #weight = round((1-dist) * 10)
            row = (subj_id, related_id, weight, subj_id, related_id)
            to_csv.append(row)

    edges = pd.DataFrame(to_csv, columns=['source', 'target', 'weight', 'source_name', 'target_name'])
    edges.to_csv(csv_path, index=False)
Example #18
def test():
	global english_punctuations, model_path
	new_model = Doc2Vec.load(model_path)
#	sentence = "reserve setup_data: [mem 0x000000008f889018-0x000000008f8bc057] usable"
#	sentence = "efi: mem14: type=2, attr=0xf, range=[0x000000008fa17000-0x000000008fb19000) (1MB)"
#	sentence = "pci 0000:07:08.2: [8086:208d] type 00 class 0x088000"
#	sentence = "i40e 0000:b0:00.2: irq 41 for MSI/MSI-X"
	sentence = "ata8: SATA link up 6.0 Gbps (SStatus 133 SControl 300)"
	#tokenize
	test_tokenized = [word.lower() for word in word_tokenize(sentence)]
	#remove stopwords
	english_stopwords = stopwords.words('english')
	test_stopwords = [word for word in test_tokenized if not word in english_stopwords]
	#remove punctuation
	test_punctuation = [word for word in test_stopwords if not word in english_punctuations]
	#stem words
	#st = PorterStemmer()   
	#test_stemmed = [st.stem(word) for word in test_punctuation]
	test_text = test_punctuation
	print "===>Testing sentence:", test_text
	inferred_vector_dm = new_model.infer_vector(test_text)
	sims = new_model.docvecs.most_similar(positive=[inferred_vector_dm])
	return sims
Example #19
	def __init__(self, model_name=None, corpus=None, stop_words=False, filename=None, **kwargs):
		"""
		model_name: name of the model which has been trained and saved
		corpus: dictionary with 'question' and 'answer', where corpus['question'] is a list of TaggedDocuments
		filename: name of file containing the questions dataset
		"""
		if corpus:
			self.corpus = corpus
		else:
			self.corpus = {}
			self.corpus['question'] = list(self.read_corpus(filename['question'], stop_words=stop_words))
			self.corpus['answer'] = list(self.read_corpus(filename['answer'], stop_words=stop_words))

		if model_name:
			self.model = Doc2Vec.load(model_name)

		else:
			size = kwargs.get('size', 50)
			min_count  = kwargs.get('min_count', 5)
			alpha = kwargs.get('alpha', 0.025)
			min_alpha = kwargs.get('min_alpha', 0.025)
			iters = kwargs.get('iters', 10)

			self.train(size=size, min_count=min_count, alpha=alpha, min_alpha=min_alpha, iters=iters)
def get_n_closest(distance_mat, subject_id, n):
    """
    Return the n closest subject_ids
    """
    s = distance_mat.loc[subject_id]
    closest = s.sort_values()[1:1+n]
    return closest


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description="produce similarity matrix")
    parser.add_argument('dbname', help="Database name")
    parser.add_argument('path_to_model', help="Model to test")
    parser.add_argument('n_closest', help="How many closest subjects to look into")
    args = parser.parse_args()

    model = Doc2Vec.load(args.path_to_model)

    subject_hash = get_subject_hash(args.dbname)
    subject_ids = list(subject_hash.keys())

    # loop over subjects and average docvecs belonging to subject.
    # place in dictionary
    subject_vectors = get_subject_vectors(subject_ids)
    distance_mat = get_distance_mat(subject_vectors)

    to_csv = []
    for subj_id in subject_ids:
        relateds = get_n_closest(distance_mat, subj_id, n=int(args.n_closest))
        for related_id, dist in relateds.iteritems():
            weight = round(1./dist)
            #weight = round((1-dist) * 10)
Example #21
def binarizator(x, coeff):
    if x > coeff:
        return 1
    else:
        return 0


def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))


data_rout = r"./data/lingvo_test"
models_rout = r"./models"

# load models:
d2v_model = Doc2Vec.load(
    os.path.join(models_rout, 'bss_doc2vec_model_20200611_draft'))
print("d2v_model load Done")

keras.losses.contrastive_loss = contrastive_loss
lstm_model = load_model(
    os.path.join(models_rout, 'siamese_model_d2v_nn_2020_0612.h5'))
print("lstm_model load Done")

with open(os.path.join(models_rout, "tokenizator_model.pickle"), "br") as f:
    lingv_model = pickle.load(f)

tk_appl = TokenizerApply(Loader(lingv_model))

tx1 = "сдавать ндс"
tx2 = "сдавать ндфл"
# tx1 = 'срок камеральной проверки по ндс заявленной к вычету'
import feature_extractor
from gensim.models.doc2vec import Doc2Vec
import parser
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import pdb
import pickle


model = Doc2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary = True)
print "MODEL LOADED"

f = open('stopwords.txt')
stoplist = set(line.split('\n')[0] for line in f)

def filter_essay(essay):
    stop_removed = filter(lambda x: x not in stoplist, essay.split())
    all_filtered = filter(lambda x: x in model.vocab, stop_removed)
    return all_filtered

def filter_essays(essays):
    return [filter_essay(essay) for essay in essays]



def calc_similarity(i1, i2):
    return model.n_similarity(i1, i2)

def classify(k, instance, training_data, training_scores):
    similarity = np.array([calc_similarity(instance, x) for x in training_data])
Example #23
def compute_vector(articles):
    model = Doc2Vec.load('./Model/Doc2Vec_Model')
    for article in articles:
        print(model.infer_vector(tag_article(article).words))
def get_vector(path, label):
    model = Doc2Vec.load(path)
    RES = []
    for i in range(len(label)):
        RES.append(model.docvecs['g_'+str(i)])
    return np.array(RES)
Example #25
    return docs


doc = generate_docs1()
print(accept[838:])
doclist = []
for i in range(doc_num):
    doclist.append(TaggedDocument(doc[i], ['상고 도로교통법_' + str(i)]))

doc_vectorizer = Doc2Vec(
    dm=0,  # PV-DBOW / default 1
    dbow_words=1,  # w2v simultaneous with DBOW d2v / default 0
    vector_size=300,  # vector size
    window=8,  # distance between the predicted word and context words
    alpha=0.025,  # learning-rate
    seed=1234,
    min_count=20,  # ignore with freq lower
    min_alpha=0.025,  # min learning-rate
    workers=4,  # multi cpu
    hs=1,  # hierarchical softmax / default 0
    negative=10,  # negative sampling / default 5
)

start = time.time()
doc_vectorizer.build_vocab(doclist)
for epoch in range(doc_vectorizer.iter):
    doc_vectorizer.train(doclist,
                         total_examples=doc_vectorizer.corpus_count,
                         epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay
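
Note that recent gensim releases (4.x) drop the iter attribute used in the loop above and recommend a single train() call that handles the learning-rate decay internally. A sketch under that assumption, not the original author's code:

# Alternative single-call training for newer gensim versions (a sketch only);
# alpha decays from alpha to min_alpha automatically over the given epochs.
doc_vectorizer.build_vocab(doclist)
doc_vectorizer.train(doclist,
                     total_examples=doc_vectorizer.corpus_count,
                     epochs=20)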
Example #26
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from regression import BaseBowRegressor

import nltk

reviews_texts, _, _, _, _ = BaseBowRegressor.get_reviews_data(range(1, 70))
sentences = []
print "Tokenizing sentences..."
for i, review in enumerate(reviews_texts):
    tokens = nltk.word_tokenize(review)
    tokens = [token.lower() for token in tokens]
    sentences.append(LabeledSentence(words=tokens,
                                     labels=["REVIEW_" + str(i)]))

print "Doc2Vec"
model = Doc2Vec(sentences, size=100, window=8, min_count=5, workers=4)
Example #27
import sanalytics.algorithms.utils as sau
from time import time
import numpy as np

## Read threshold
arg = sys.argv[1].split("|")
t = float(arg[0])
name = arg[1]
fold = int(arg[1].split("_")[-2])

## Import Data
X_train = pd.read_parquet("datasets/rq3_data/sec1.0_train.parquet")
X_val = X_train[X_train.fold==fold]

## Import D2V
d2v = Doc2Vec.load("datasets/kfold_d2v/{}.model".format(name))

## In pos set
def pos_set(str):
    if "|" in str: return False
    if "sse" in str: return True
    if "set1" in str: return True
    if "set2" in str: return True

## Predict functions
def predict(post, thresh, d2v):
    vec = d2v.infer_vector("{} {} {}".format(post.title, post.question, post.answers).split())
    sims = d2v.docvecs.most_similar([vec], topn=1000)
    return min(len([i for i in sims if pos_set(i[0]) and i[1] > thresh]), 1)
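
A hedged usage sketch for predict() over the validation fold (this assumes X_val exposes title, question and answers columns, as the attribute accesses above imply):

# Hypothetical driver loop; not part of the original script.
preds = [predict(post, t, d2v) for post in X_val.itertuples()]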

## Columns
Example #28
 def load(self, model_dir):
     model_path = self._get_model_path(model_dir)
     self.model = Doc2Vec.load(model_path)
def FeaturiseData(featureType,
                  trainDF,
                  testDF,
                  unsupervisedDF=None,
                  pickleObject=False,
                  reduceDims=500,
                  verbose=False):

    # convert to test and train,
    # this is needed here so as not to put training data into
    # the featurisation functions, since in reality they would be built
    # using only training data

    if featureType == 'tfidf':
        # now we need to convert it into features so the data can be
        # put into a machine learning model
        vectorizer = TfidfVectorizer(stop_words='english',
                                     analyzer='word',
                                     min_df=0.02,
                                     max_df=0.98,
                                     use_idf=False,
                                     norm=None)

        #

        featureDataTrainTemp = vectorizer.fit_transform(
            trainDF['stemmed_text'])

        featureDataTrain = featureDataTrainTemp.todense()

        #featureDataTrain = (featureDataTrain - dataMean)/dataSD

        labelsTrain = np.array(trainDF['labels'])

        words = vectorizer.get_feature_names()

        # remember ONLY TRANSFORM, don't fit!!!
        featureDataTestTemp = vectorizer.transform(testDF['stemmed_text'])

        featureDataTest = featureDataTestTemp.todense()

        #featureDataTest = (featureDataTest - dataMean)/dataSD

        labelsTest = np.array(testDF['labels'])

        if verbose:
            startTime = datetime.now()
            print("reducing dims...")

        # 100 dims chosen arbitrarily...
        #featureData, _, _ = scipy.sparse.linalg.svds(featureDataTrainTemp, k = 100)

#        svdObj = TruncatedSVD(n_components=reduceDims, n_iter=7, random_state=42)
#
#        featureDataTrain = svdObj.fit_transform(featureDataTrainTemp)
#
#
#        #ONLY transform!!! do not fit
#        featureDataTest = svdObj.transform(featureDataTestTemp)
#

        if verbose:
            tookThisLong = datetime.now() - startTime
            print("SVD took %s " % str(tookThisLong))
            print("number of  words = ", len(words))

    elif featureType == "gensim":

        #gensimDF = pd.concat([trainDF, unsupervisedDF])

        # not sure if order is important so shuffle anyway, can't hurt...
        #gensimDF = gensimDF.sample(frac = 1)

        # convert the stemmed words into a format gensim can deal with
        documentsGensim = [
            TaggedDocument(doc, [i])
            for i, doc in enumerate(unsupervisedDF['stop_words_removed_list'])
        ]

        # build doc2vec model - this could do with some experimentation...
        modelGensim = Doc2Vec(documentsGensim,
                              vector_size=reduceDims,
                              window=4,
                              min_count=3,
                              workers=6)

        # now use the model to infer vectors
        docVecList = []
        labels = []
        for index, row in trainDF.iterrows():

            docVecList.append(
                modelGensim.infer_vector(row['stop_words_removed_list']))
            labels.append(row['labels'])

        featureDataTrain = np.array(docVecList)
        labelsTrain = np.array(labels)

        docVecList = []
        labels = []
        for index, row in testDF.iterrows():

            docVecList.append(
                modelGensim.infer_vector(row['stop_words_removed_list']))
            labels.append(row['labels'])

        featureDataTest = np.array(docVecList)
        labelsTest = np.array(labels)

        #print("labelsTest.shape = ", labelsTest.shape)

    if pickleObject:
        # pickle data
        dirname = os.path.dirname(__file__)

        # pickle train Data
        fileNameFeatureDataTrain = '../Data_Featurised/train_data_%s.pkl' % featureType
        fileNameFullFeatureDataTrain = os.path.join(dirname,
                                                    fileNameFeatureDataTrain)
        file = open(fileNameFullFeatureDataTrain, 'wb')
        pickle.dump(featureDataTrain, file)
        file.close()

        # pickle test Data
        fileNameFeatureDataTest = '../Data_Featurised/test_data_%s.pkl' % featureType
        fileNameFullFeatureDataTest = os.path.join(dirname,
                                                   fileNameFeatureDataTest)
        file = open(fileNameFullFeatureDataTest, 'wb')
        pickle.dump(featureDataTest, file)
        file.close()

        # pickle train labels
        fileNameFeatureDataLabelTrain = '../Data_Featurised/train_label_data_%s.pkl' % featureType
        fileNameFullFeatureDataTrainLabel = os.path.join(
            dirname, fileNameFeatureDataLabelTrain)
        file = open(fileNameFullFeatureDataTrainLabel, 'wb')
        pickle.dump(labelsTrain, file)
        file.close()

        # pickle test Data
        fileNameFeatureDataLabelTest = '../Data_Featurised/test_label_data_%s.pkl' % featureType
        fileNameFullFeatureDataTestLabel = os.path.join(
            dirname, fileNameFeatureDataLabelTest)
        file = open(fileNameFullFeatureDataTestLabel, 'wb')
        pickle.dump(labelsTest, file)
        file.close()

        if featureType == 'tfidf':

            #pickle tfidf vectorizer and truncated SVD
            fileNameTfidfObj = '../Feature_Models/tfidf_vect.pkl'
            fileNameFullTfidfObj = os.path.join(dirname, fileNameTfidfObj)
            file = open(fileNameFullTfidfObj, 'wb')
            pickle.dump(vectorizer, file)
            file.close()

#            #pickle tfidf vectorizer and truncated SVD
#            fileNameSvdObj = '../Feature_Models/svd_obj.pkl'
#            fileNameFullSvdObj = os.path.join(dirname, fileNameSvdObj)
#            file = open(fileNameFullSvdObj, 'wb')
#            pickle.dump(svdObj, file)
#            file.close()

        elif featureType == 'gensim':

            fileNameGensimObj = '../Feature_Models/gensim_obj.pkl'
            fileNameFullGensimObj = os.path.join(dirname, fileNameGensimObj)
            modelGensim.save(fileNameFullGensimObj)

    return featureDataTrain, featureDataTest, labelsTrain, labelsTest
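
A possible call to the featuriser above (the DataFrame names mirror the parameters; the choice of 'gensim' and reduceDims=300 is illustrative only):

# Hypothetical invocation of FeaturiseData; the frame variables are assumed to exist.
featureDataTrain, featureDataTest, labelsTrain, labelsTest = FeaturiseData(
    'gensim', trainDF, testDF, unsupervisedDF=unsupervisedDF, reduceDims=300)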
Example #30
print(documents[7150])  # test

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in documents:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
         for text in documents]

# train the model
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
#print(documents)
model = Doc2Vec(documents, vector_size=35, window=2, min_count=2, workers=5)
model.train(documents, total_examples=model.corpus_count, epochs=150)
test_text = [
    'Silicon', 'bValley', 'Girlb', 'Apple', 'iPad', 'GB', 'yes', 'good',
    'enough', 'to', 'make', 'me', 'put', 'down', 'my', 'Kindle', 'Pampers',
    'Cruisers', 'Diapers', 'For', 'A', 'Snug', 'No', 'Gap', 'Fit', 'Nikon',
    'Coolpix', 'Digital', 'Camera', 'Not', 'Exactly', 'an', 'SLR', 'but',
    'Gets', 'the', 'Job', 'Done', 'Pampers', 'Swaddlers', 'for', 'Newborns',
    'Confusing', 'for', 'the', 'First', 'Time', 'Mom', 'TiVo', 'Humax',
    'DVDRRW', 'Series', 'Digital', 'Recorder', 'How', 'Do', 'I', 'Love',
    'Thee', 'MAC', 'Shadestick', 'for', 'Eyes', 'Easy', 'as', 'Pumpkin', 'Pie',
    'Postrio', 'Restaurant', 'San', 'Francisco', 'Still', 'Glamorous', 'Sonic',
    'Rio', 'Sport', 'S', 'MP', 'Player', 'Hours', 'of', 'skipfree', 'music',
    'for', 'Runners', 'Peter', 'Jacksons', 'Return', 'of', 'the', 'King',
    'Tears', 'and', 'Triumph', 'Propel', 'Fitness', 'Water', 'Atkins',
    'Friendlier', 'replacement', 'for', 'Gatorade', 'MAC', 'Paints', 'More',
Example #31
             u'ỦƯỨỪỬỮỰỲỴỶỸÝ'
regex = re.compile("[^{0}a-zA-Z0-9 ]".format(vn_accents))
document = regex.sub(" ", document)
# remove duplicated spaces
document = re.sub(' +', ' ', document)
# remove leading and trailing spaces
document = document.strip()
# lowering
document = document.lower()

with open("vocabulary.json", "r") as fr:
    vocab = json.load(fr)

vectorizer = TfidfVectorizer(vocabulary=vocab, max_features=10000)
tfidfvec = vectorizer.fit_transform([document])
docvec_model = Doc2Vec.load("doc2vec.model")
docvec = docvec_model.infer_vector(document.split())

tfidfvec = tfidfvec.toarray()
tfidfvec.shape = (1, 10000)
docvec.shape = (1, 60)

prefix = "./pretrained/multiview/"
list_files = glob.glob("{}*".format(prefix))
# model_path = max(list_files, key=os.path.getctime)
fnames_acc = []
for file in list_files:
    fname_acc = re.findall(r"([\d\.]+)\.hdf5", file)
    fnames_acc.append(float(fname_acc[0]))
fnames_acc.sort()
max_acc = "{:.4f}".format(fnames_acc[-1])
Example #32
            F1 = 2 * Recall * Precision / (Recall + Precision)

        result = {'params': {'window': window, 'min_count': min_count, 'vector_size': vector_size, 'alpha': alpha, 'min_alpha': min_alpha, 'epochs': epochs},
                  'score': {'Accuracy': Accuracy, 'Precision': Precision, 'Recall': Recall, 'F1': F1}}
        return result


if __name__ == '__main__':
    warnings.filterwarnings('ignore', category=FutureWarning)

    dataset_dir = './static/processed/v3/'

    norm_train = list(read_train_dataset(dataset_dir + 'norm-train.jsonl'))
    anom_train = list(read_train_dataset(dataset_dir + 'anom-train.jsonl'))

    model = Doc2Vec(norm_train + anom_train, dm=1, window=2, min_count=1, vector_size=300, alpha=0.08, min_alpha=0.01, epochs=600, workers=6)

    norm_train_vecs = [model.docvecs['norm'+str(i)] for i in range(len(norm_train))]
    anom_train_vecs = [model.docvecs['anom'+str(i)] for i in range(len(anom_train))]

    x_train = norm_train_vecs + anom_train_vecs
    y_train = ['norm']*len(norm_train_vecs) + ['anom']*len(anom_train_vecs)

    clf = RandomForestClassifier(random_state=0, n_estimators=50, max_depth=23, max_features=100, n_jobs=6)
    clf.fit(x_train, y_train)

    # testing
    norm_test_vecs = []
    norms = list(read_test_dataset(dataset_dir + 'norm-test.jsonl'))
    for norm in norms:
        norm_test_vecs.append(model.infer_vector(norm['words']))
Example #33
def train_doc2vec(data_frame, patent_ids, classif_level, classif_type):
    root_location = fh.get_root_location("data/lstm_outcome/")
    doc2vec_model_save_location = fh.join_paths(root_location, "doc2vec_model/")

    preprocessed_location = fh.join_paths(root_location, "preprocessed_data/separated_datasets/")
    training_preprocessed_files_prefix = fh.join_paths(preprocessed_location, "training_docs_data_preprocessed/")
    validation_preprocessed_files_prefix = fh.join_paths(preprocessed_location, "validation_docs_data_preprocessed/")
    test_preprocessed_files_prefix = fh.join_paths(preprocessed_location, "test_docs_data_preprocessed/")

    vocab_path = fh.join_paths(doc2vec_model_save_location, "vocab_model")

    training_docs_iterator = create_tuple_array(data_frame, patent_ids, text_batch_size=10000)

    #####
    tagged_data = training_docs_iterator
    cores = multiprocessing.cpu_count()
    model_dbow = Doc2Vec(dm=1, vector_size=200, window=2, negative=10, sample=1e-8, hs=0, min_count=50,
                         alpha=0.25, min_alpha=0.05, dbow_words=0, seed=1234, concat=0, workers=cores)
    model_dbow.build_vocab([x for x in tqdm(tagged_data)])

    for epoch in range(30):
        # model_dbow.train(utils_shuffle_rows([x for x in tqdm(tagged_data)]), total_examples=len(tagged_data), epochs=1)
        model_dbow.train(utils_shuffle_rows([x for x in tqdm(tagged_data)]), total_examples=len(tagged_data), epochs=1)
        model_dbow.alpha -= 0.002
        model_dbow.min_alpha = model_dbow.alpha

    date = datetime.datetime.now().isoformat()
    model_dbow.save(fh.link_paths(vocab_path, 'doc2vec_vocab_30_epochs'))
    #####

    params = wmh.get_parameters_lstm_doc2vec()
    GLOBAL_VARS.DOC2VEC_MODEL_NAME, placeholder_model_name, doc2vec_model = wmh.get_lstm_doc2vec(params, classif_level, classif_type)

    # yields a list of sentences id, text as a tuple or (id, tuple)
    # training_docs_iterator = lrh.BatchWrapper(training_preprocessed_files_prefix, text_batch_size=10000, level=classif_level,
    #                                       level_type=classif_type)
    doc2vec_model.build_vocab(documents=training_docs_iterator, progress_per=params[13])
    doc2vec_model.save(fh.link_paths(vocab_path, "doc2vec_vocab"))

    DOC2VEC_ALPHA_DECREASE = wmh.set_alpha_parameters_lstm_doc2vec(doc2vec_model)
    start_epoch = 1

    # for epoch in range(1, params[11] + 1):
    #     GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
    #     doc2vec_folder_path = fh.join_paths(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME)
    #     if fh.ensure_exists_path_location(fh.link_paths(doc2vec_folder_path, "doc2vec_model")):
    #         start_epoch = epoch

    # if start_epoch > 1:
    #     GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(start_epoch)
    #     doc2vec_folder_path = fh.join_paths(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME)
    #     # if a model of that epoch already exists, we load it and proceed to the next epoch
    #     doc2vec_model = Doc2Vec.load(fh.link_paths(doc2vec_folder_path, "doc2vec_model"))
    #     start_epoch += 1

    ## The Actual Training
    for epoch in range(start_epoch, params[11] + 1):
        print("### epoch "+str(epoch)+" ###")
        # set new filename/path to include the epoch
        GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)
        doc2vec_folder_path = fh.join_paths(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME)
        # train the doc2vec model
        # training_docs_iterator = lrh.BatchWrapper(training_preprocessed_files_prefix, text_batch_size=10000, level=classif_level,
        #                                 level_type=classif_type) # yields a list of sentences id, text as a tuple or (id, tuple)

        doc2vec_model.train(documents=training_docs_iterator, total_examples=len(training_docs_iterator),
                            report_delay=params[12], epochs=params[10])
        doc2vec_model.alpha -= DOC2VEC_ALPHA_DECREASE  # decrease the learning rate
        doc2vec_model.min_alpha = doc2vec_model.alpha  # fix the learning rate, no decay
        doc2vec_model.save(fh.link_paths(doc2vec_folder_path, "doc2vec_model"))

    if epoch != params[11]:
        print("still training epochs missing: " + str(epoch))
        sys.exit(1)
Example #34
 def __init__(self):
     self.model=Doc2Vec.load('./model.d2v')
     self.st = LancasterStemmer()
Example #35
def get_doc2vec_model(model_path):
    return Doc2Vec.load(model_path)
Example #36
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

doc = ["test 1"]

tokenized_doc = ['ok']
tokenized_doc

print(doc)

#%%
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
tagged_data

model = Doc2Vec(tagged_data,
                vector_size=20,
                window=2,
                min_count=1,
                workers=4,
                epochs=100)

model.save("test_doc2vec.model")

model = Doc2Vec.load("test_doc2vec.model")

model.wv.vocab
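
A small follow-up sketch (not in the original cell): inferring a vector for a new token list with the reloaded toy model.

# Hypothetical check on the reloaded model; vector_size was set to 20 above.
new_vec = model.infer_vector(['ok'])
print(new_vec.shape)  # expected: (20,)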

#%% Question no. 4
import re
import os
unsup_sentences = []

for dirname in [
import json
import time
import random
import string
import numpy as np
from gensim.models.doc2vec import Doc2Vec
import anyconfig

config = anyconfig.load(open("config.yaml", 'rb'))
model = Doc2Vec.load('doc2vec/model')

keys = list(config["label"]["id2value"].keys())[1:]

dic_label = {}
for i, key in enumerate(keys):
    dic_label[key] = i + 1
print(dic_label)


def aug(shapes):
    add_shapes = []
    rm_shapes = []
    for shape in shapes:
        points = shape['points']
        points = sorted(points)
        if points[0][1] > points[1][1] and points[2][1] > points[3][1]:
            points = [points[1], points[3], points[2], points[0]]
        elif points[0][1] > points[1][1] and points[2][1] < points[3][1]:
            points = [points[1], points[2], points[3], points[0]]
        elif points[0][1] < points[1][1] and points[2][1] > points[3][1]:
            points = [points[0], points[3], points[2], points[1]]
def main():
    stats_Fscore, stats_recall, stats_precision  = list(), list(), list()
    data_pos, data_neg = load_data("../data/")
    data_pos, data_neg = data_filter(data_pos), data_filter(data_neg)
    model = Doc2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
    print "Model loading done!"
    for test_mode in range(2):
        if test_mode == 0:
            print "non-SMOTE"
        else:
            print "SMOTE"
        sFscores, sRecalls, sPrecisions = list(), list(), list()
        for iteration in range(NUM_OF_ITERATION): # start iteration
            random.seed(iteration)
            random.shuffle(data_pos)
            random.shuffle(data_neg)
            data_pos_vec, data_neg_vec = feature_extraction_Doc2Vec(data_pos, data_neg, model) # convert to Word Vectors
            print len(data_pos_vec), len(data_neg_vec)
            models = {"SVC": sklearn.svm.SVC(), \
                      "Logit": sklearn.linear_model.LogisticRegression(), \
                      "DT": sklearn.tree.DecisionTreeClassifier(), \
                      "NBayes": sklearn.naive_bayes.GaussianNB(), \
                      "NNeighbors": sklearn.neighbors.nearest_centroid.NearestCentroid()}
            model_chosen = "SVC"
            accuracys, precisions, recalls, Fscores = cross_validationS(\
                data_pos_vec, data_neg_vec, models[model_chosen], num_cross=NUM_OF_CROSSFOLD, smote_flag=test_mode) # cross validation
            sFscores.extend(Fscores)
            sRecalls.extend(recalls)
            sPrecisions.extend(precisions)
        stats_Fscore.append(sFscores)
        stats_recall.append(sRecalls)
        stats_precision.append(sPrecisions)
    plt.figure()
    colors = ["red", "blue"]
    modes = ["no-SMOTE", "SMOTE"]
    for i in range(len(stats_Fscore)): # plot statistical summary
        plt.plot(stats_Fscore[i], marker='o', color=colors[i], label=modes[i]+"_Fscore")
        #plt.plot(stats_precision[i], marker='+', color=colors[i], label=modes[i]+"_precision")
        #plt.plot(stats_recall[i], marker='*', color=colors[i], label=modes[i]+"_recall")
    plt.ylim([0, 1.0])
    plt.legend(loc=4, borderaxespad=0.5)
    plt.ylabel("Scores")
    plt.xlabel("Data Sequence")
    plt.savefig("../results/"+model_chosen+"-ValidationStats.png")
    savefile_name = "../results/" + model_chosen + "-ValidationStats.txt"
    fp = open(savefile_name, 'w')
    print "******** Evaluation **********\n"
    fp.write("******** Evaluation **********\n")
    for test_mode in range(2): # print statistical evaluations
        stats_precision[test_mode].sort()
        stats_recall[test_mode].sort()
        stats_Fscore[test_mode].sort()
        p_median = stats_precision[test_mode][len(stats_precision)/2]
        r_median = stats_recall[test_mode][len(stats_recall)/2]
        f_median = stats_Fscore[test_mode][len(stats_Fscore)/2]
        iqr_p = stats_precision[test_mode][int(len(stats_precision)*0.75)] - stats_precision[test_mode][int(len(stats_precision)*0.25)]
        iqr_r = stats_recall[test_mode][int(len(stats_recall)*0.75)] - stats_recall[test_mode][int(len(stats_recall)*0.25)]
        iqr_f = stats_Fscore[test_mode][int(len(stats_Fscore)*0.75)] - stats_Fscore[test_mode][int(len(stats_Fscore)*0.25)]
        print modes[test_mode]
        fp.write(modes[test_mode]+'\n')
        print "\t p_median \t r_median \t f_median"
        fp.write("\t p_median \t r_median \t f_median \n")
        print "\t%.5f \t%.5f \t%.5f" % (p_median, r_median, f_median)
        fp.write("\t%.5f \t%.5f \t%.5f \n" % (p_median, r_median, f_median))
        print "\t iqr_p \t iqr_r \t iqr_f"
        fp.write("\t iqr_p \t iqr_r \t iqr_f \n")
        print "\t%.5f \t%.5f \t%.5f" % (iqr_p, iqr_r, iqr_f)
        fp.write("\t%.5f \t%.5f \t%.5f \n" % (iqr_p, iqr_r, iqr_f))
        print '\n'
Example #39
def compute_similarities_text_word2vec(train=False, learn_idx=None):

    QUERY = "SELECT * FROM lyrics LIMIT 5000"
    cur.execute(QUERY)

    print("let's go!")
    i = 0
    id_to_sentence = dict()
    for msd_track_id, mxm_track_id, word, count, is_test in cur.fetchall():
        id_to_sentence[msd_track_id] = id_to_sentence.get(
            msd_track_id, "") + (word + " ") * count

    from nltk.corpus import stopwords
    # download('stopwords')
    stop_words = set(stopwords.words('english'))
    for k, v in id_to_sentence.items():
        id_to_sentence[k] = ' '.join(
            [word for word in v.split() if word not in stop_words])
        id_to_sentence[
            k] = "Blank" if id_to_sentence[k] == "" else id_to_sentence[k]
    track_ids, sentences = zip(*id_to_sentence.items())

    from gensim.models import Word2Vec
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    from gensim.test.utils import common_texts, get_tmpfile

    ############### Word2Vec #######################
    # path = get_tmpfile("word2vec.model")
    # model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
    # model = Word2Vec.load("word2vec.model")
    # model.save("word2vec.model")

    ############### Doc2Vec ########################
    from nltk.tokenize import word_tokenize

    def learn_model(learn_idx):

        # from nltk import download
        # download("punkt")
        documents = [
            TaggedDocument(words=word_tokenize(doc), tags=[i])
            for i, doc in enumerate(sentences)
        ]

        max_epochs = 10000
        vec_size = 300
        alpha = 0.040

        model = Doc2Vec(size=vec_size,
                        alpha=alpha,
                        min_alpha=0.00025,
                        min_count=1,
                        dm=0)
        # model = Doc2Vec(vector_size=vec_size, dm=0)
        model.build_vocab(documents)

        for epoch in range(max_epochs):
            print("iteration {0}".format(epoch))
            model.train(documents,
                        total_examples=model.corpus_count,
                        epochs=model.iter)
            # model.alpha -= 0.0001
            # model.min_alpha = model.alpha

        model.save("doc2vec_bow{}.model".format(learn_idx))

    if train:
        if learn_idx != None:
            learn_model(learn_idx)
    else:
        model = Doc2Vec.load("doc2vec_bow{}.model".format(learn_idx))

        # test_doc = word_tokenize(sentences[0])
        # v1 = model.infer_vector(test_doc)
        # print("V1_infer:",v1)

        def print_top_n(x, n):
            title, artist = msd_titles[track_ids[x]]
            print('Examining --{} by {}--'.format(title, artist))
            print(' '.join(sorted(id_to_sentence[track_ids[x]].split())))
            print("------")
            test_doc = word_tokenize(id_to_sentence[track_ids[x]])
            v1 = model.infer_vector(test_doc)
            # print("V1_infer:",v1)
            similar_doc = model.docvecs.most_similar([v1], topn=n)
            print("len(similar_doc):", len(similar_doc))
            i = 1
            for idx, similarity_degree in similar_doc:
                idx = int(idx)
                title, artist = msd_titles[track_ids[idx]]
                print()
                print("{}. {} by {} with similarity degree of {}".format(
                    i, title, artist, similarity_degree))
                print(' '.join(sorted(id_to_sentence[track_ids[idx]].split())))
                print()
                i += 1

        print_top_n(0, 5)
Example #40
    return torch.tensor(result)
#convert the texts to vectors in batch and return them

if __name__ == '__main__':
    datapath = 'C:/Users/13170/Desktop/dataone/'
    read = []
    for count in range(1,24825):
        txtopen = open(datapath+str(count)+'.txt')
        txtpre = txtopen.read()
        txtpre = Preprocessing(txtpre)
        read.append(txtpre)
        txtopen.close()

    documents = [TaggedDocument(read[i],[i]) for i in range(len(read))]

    model_dbow = Doc2Vec(documents=documents,dm=0,vector_size=500,workers=cpu_count(),alpha=0.025,min_alpha=0.025)
    model_dm = Doc2Vec(documents=documents, dm=1, vector_size=500, workers=cpu_count(), alpha=0.025, min_alpha=0.025)
    for epoch in range(10):
        begin = time.time()
        model_dbow.train(documents,total_examples=model_dbow.corpus_count,epochs=10)
        model_dbow.alpha -= 0.002
        model_dbow.min_alpha = model_dbow.alpha
        print("epoch:{e} running time: {n}".format(e=str(epoch), n=str(time.time() - begin)))

    for epoch in range(10):
        begin = time.time()
        model_dm.train(documents, total_examples=model_dm.corpus_count, epochs=10)
        model_dm.alpha -= 0.002
        model_dm.min_alpha = model_dm.alpha
        print("epoch:{e} running time: {n}".format(e=str(epoch), n=str(time.time() - begin)))
Example #41
def run_clustering_experiments(guide: Guide,
                               nice_dir: str,
                               vector_dir: str,
                               overwrite: bool = False,
                               vector_type=TFIDF_VECTORS,
                               cluster_type=TOPIC_CLUSTER,
                               max_samples=None,
                               num_runs=20,
                               vec_path: str = None):
    """
    Run the semi-supervised clustering experiments.

    This consists of:
        *
    """

    # A pseudorandom number generator is created,
    # then seeded, to ensure that the results are
    # replicable from run to run.
    r = random.Random(20)
    sorted_docs = sorted(guide.docs, key=lambda x: x.id)
    r.shuffle(sorted_docs)

    # Set the maximum number of samples to a default for
    # topics (20) or categories (50) if left unspecified
    if max_samples is None:
        max_samples = 20 if cluster_type == TOPIC_CLUSTER else 50

    # Load the spacy language model if we plan to
    # use the GloVe vectors.
    spacy_model = None
    if vector_type == GLOVE_VECTORS:
        spacy_model = spacy.load(
            '/home2/rgeorgi/python3/lib/python3.4/site-packages/en_core_web_lg/en_core_web_lg-2.0.0/'
        )

    w2v_model = None
    if vector_type == WORD2VEC_VECTORS:
        if os.path.splitext(vec_path)[1] in ['.bin', '.gz']:
            w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
                vec_path, binary=True)
        else:
            w2v_model = Doc2Vec.load(vec_path)
            # w2v_model = gensim.models.Word2Vec.load(vec_path)

    # -------------------------------------------
    # Outer loop
    #
    # To make sure that the clustering results are not a fluke of picking a given couple documents
    # to seed the clusters, a number of runs are performed in which the order of the document set
    # is varied, so that different example docs will be chosen
    # -------------------------------------------
    for run_num in range(1, num_runs):

        ordered_docs = []
        true_labels = []

        # sorted_docs = sorted_docs[window_size:] + sorted_docs[:window_size]
        r.shuffle(sorted_docs)

        # Make sure that the order of documents
        # and their labels is static for evaluation.
        for doc in sorted_docs:
            ordered_docs.append(doc)
            if cluster_type == CATEGORY_CLUSTER:
                true_labels.append(doc.category.category_id)
            else:
                true_labels.append(doc.topic.id)

        if vector_type == TFIDF_VECTORS:
            matrix = tfidf_matrix(ordered_docs, nice_dir)
        elif vector_type == GLOVE_VECTORS:
            matrix = glove_matrix(ordered_docs,
                                  nice_dir,
                                  vector_dir,
                                  spacy_model,
                                  overwrite=overwrite)
        elif vector_type == WORD2VEC_VECTORS:
            matrix = gensim_matrix(ordered_docs,
                                   nice_dir,
                                   vector_dir,
                                   w2v_model,
                                   overwrite=overwrite)

        # -------------------------------------------
        # Iterate over a different number of supervised
        # samples
        # -------------------------------------------

        for supervised_samples in range(0, max_samples + 1):

            # -------------------------------------------
            # Build the initial clusters
            # -------------------------------------------
            inits = init_cluster_dict(matrix.shape[1])
            samples_per_cluster = defaultdict(int)

            # Now, let's pick out some supervised
            # samples.
            for i, doc in enumerate(ordered_docs):
                topic_id = doc.topic.id
                category_id = doc.category.category_id

                label_key = category_id if cluster_type == CATEGORY_CLUSTER else topic_id

                if samples_per_cluster[label_key] <= supervised_samples:
                    v = matrix[i].toarray()[0, :] if not isinstance(
                        matrix[i], np.ndarray) else matrix[i]
                    inits[label_key] += v
                    samples_per_cluster[label_key] += 1

            for key in inits:
                if samples_per_cluster[key] > 1:
                    inits[key] /= samples_per_cluster[key]

            # -------------------------------------------
            # Now, do the clustering.
            # -------------------------------------------

            # If no samples are used, seed the clusters randomly.
            # otherwise, use the generated init vectors.
            if supervised_samples == 0:
                init = 'random'
            else:
                init = np.array([v for v in inits.values()])

            # Set the number of clusters based on the number of clusters
            # defined in the guide
            num_clusters = len(
                g.categories) if cluster_type == CATEGORY_CLUSTER else len(
                    g.topics)

            k = KMeans(
                n_clusters=num_clusters,
                random_state=5,
                init=init,
                n_init=1,
            )
            k.fit(matrix)

            rand_index = adjusted_rand_score(true_labels, k.labels_)

            # Finally, print out a CSV row for each iteration.
            csv = '{},{},{}'.format(run_num, supervised_samples, rand_index)
            print(csv)
Example #42

# In[92]:

reviewTrain = []
for i in text:
    reviewTrain.append(preprocess(i))
del text

# In[93]:

res = []
for i in reviewTrain:
    res.append(i)
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(res)]
model = Doc2Vec(documents)

# In[94]:

reviewVec = np.array([])
for i in res:
    reviewVec = np.append(reviewVec, model.infer_vector(i))
reviewVec = reviewVec.reshape(len(res), int(reviewVec.shape[0] / len(res)))

# In[95]:

lr = LogisticRegression(multi_class='multinomial', solver='newton-cg')
#
#
X_train, X_test, y_train, y_test = train_test_split(reviewVec[:trainSize, :],
                                                    trainLabel,
Example #43
models['EXT'] = CustomUnpickler(open('BigFiveModels/CON_model.pkl', 'rb')).load()
models['AGR'] = CustomUnpickler(open('BigFiveModels/AGR_model.pkl', 'rb')).load()
models['NEU'] = CustomUnpickler(open('BigFiveModels/NEU_model.pkl', 'rb')).load()

#Load the Clap Prediction Model with the 'simple' Lasso Regression model - CAUTION - It is not very accurate ! 

with open("./ClapPredictionModels/clap_prediction_model_lasso.pkl", 'rb') as file:
    clap_prediction_model_lasso = pickle.load(file)

column_for_regression=["sentence_count","title_word_count","average_word_count_per_sentence",
                      "text_word_count","vocab_count_excl_commonwords","imgs_per_1000words",
                      "FS_GradeScore","vids_per_1000words","polarity","subjectivity"]

#Load the pre-trained Doc2Vec Model trained on 200 sample Medium Data Science articles with 300 vec dimensions

Doc2VecModel= Doc2Vec.load("./ClapPredictionModels/Doc2Vec.model")

# Load the average document vectors for the reference articles, grouped by clap count;
# e.g. VH_Vec averages the 37 of the 200 articles that have > 5k claps

VH_Vec=load('./ClapPredictionModels/VH_Claps_Vector.npy')
H_Vec=load('./ClapPredictionModels/H_Claps_Vector.npy')
M_Vec=load('./ClapPredictionModels/M_Claps_Vector.npy')
L_Vec=load('./ClapPredictionModels/L_Claps_Vector.npy')
VL_Vec=load('./ClapPredictionModels/VL_Claps_Vector.npy')
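# How these reference vectors are used is not shown in this excerpt; a hedged
# sketch of the presumable idea is to infer a vector for a new article and
# compare it against each clap-bucket average with cosine similarity.
# `article_tokens` below is a hypothetical, already-tokenised article.
from numpy import dot
from numpy.linalg import norm

def cosine(a, b):
    # plain cosine similarity between two 1-D vectors
    return dot(a, b) / (norm(a) * norm(b))

article_tokens = ["data", "science", "tutorial"]           # placeholder input
article_vec = Doc2VecModel.infer_vector(article_tokens)    # 300-dim vector

bucket_similarities = {
    "VH": cosine(article_vec, VH_Vec),
    "H": cosine(article_vec, H_Vec),
    "M": cosine(article_vec, M_Vec),
    "L": cosine(article_vec, L_Vec),
    "VL": cosine(article_vec, VL_Vec),
}
print(max(bucket_similarities, key=bucket_similarities.get))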

def get_html(url):
    user_agent_list = ['Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
                            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
                            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
                               rw_df['Madein_city'][idx - kag_len],
                               rw_df['Variety'][idx - kag_len],
                               str(idx)
                           ]))

max_epochs = 50
vec_size = 100  # Previous setup - 25
alpha = 0.025
window_size = 5
num_workers = 4
minimum_count = 1  # Previous setup - 2
model = Doc2Vec(
    vector_size=vec_size,
    window=window_size,
    alpha=alpha,
    min_alpha=0.00025,
    min_count=minimum_count,
    dm=1,  # PV-DM
    workers=num_workers,
    epochs=max_epochs)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    start = timer()
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
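Note that the loop above runs `model.epochs` passes on every iteration while also decaying `model.alpha` by hand. gensim can handle the epoch loop and the learning-rate decay internally; a minimal alternative sketch over the same `tagged_data` would be:

# Alternative sketch: one train() call; gensim decays the learning rate from
# `alpha` down to `min_alpha` over `epochs` passes on its own.
alt_model = Doc2Vec(vector_size=vec_size,
                    window=window_size,
                    alpha=alpha,
                    min_alpha=0.00025,
                    min_count=minimum_count,
                    dm=1,
                    workers=num_workers,
                    epochs=max_epochs)
alt_model.build_vocab(tagged_data)
alt_model.train(tagged_data,
                total_examples=alt_model.corpus_count,
                epochs=alt_model.epochs)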
Example #45
import hug
from gensim.models.doc2vec import Doc2Vec
from gensim.utils import simple_preprocess
import re

model = Doc2Vec.load('models/wiki-latest')


@hug.get('/topicos', examples='frase=Vingadores são um grupo de super-heróis')
@hug.local()
def topicos(frase: str):
    """Returns the topics of an arbitrary sentence"""
    tokens = simple_preprocess(frase)
    inferred_vector = model.infer_vector(tokens)
    similars = model.docvecs.most_similar([inferred_vector], topn=10)

    return {
        'topicos': similars
    }
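# Because the route is also registered with @hug.local(), the function can be
# called as a plain Python function; a quick smoke test (hypothetical input
# sentence, and the model above must have loaded) could be:
if __name__ == '__main__':
    print(topicos('Vingadores são um grupo de super-heróis'))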


@hug.get(examples="expressao=homem está para rei como mulher está para")
def analogia(expressao: str):
    """Computes an analogy between terms"""

    entry = '{0}'.format(expressao)
    math_symbol = r"\+"
    analogy_symbol = "está para"

    # Case 1: user wants to do word math: word1 - word2 + word3
    positive = []
Example #46
from gensim.models.doc2vec import Doc2Vec
import os

from .heroku import *  # noqa

ALLOWED_HOSTS = ['0.0.0.0', '127.0.0.1:8000']

SECRET_KEY = 'f=fqwc&$zt_6rf8y45j1l7w!^e*%a_c)4sf+v*_uf%hwf5_*16'

# MODEL_FILE is the full path of the neural net model to be used.
# *Make sure the test file is not .gitignored*; it is needed for CI.
# However, production-quality models are too big for GitHub, so they should be
# .gitignored.
# MODEL_FILE defaults to the test model used for CI; because it is checked into
# the repo it should be present and is therefore a sensible default for local
# development. If you want to have a production-like environment, and to use
# a model that represents the entire database, get it separately; put it in
# hamlet/model/hamlet.model; and add DJANGO_USE_LIVE_MODEL=True to your .env.
modelpath = os.environ.get('DJANGO_MODEL_path', '')
if modelpath:
    MODEL_FILE = os.path.join(PROJECT_DIR, modelpath)
else:
    MODEL_FILE = os.path.join(PROJECT_DIR, 'testmodels', 'testmodel.model')

NEURAL_NET = Doc2Vec.load(MODEL_FILE)

# The string "PASSED" will pass any captcha.
# Don't use this in production!
# http://django-simple-captcha.readthedocs.io/en/latest/advanced.html#captcha-test-mode
CAPTCHA_TEST_MODE = True
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from gensim.models.doc2vec import Doc2Vec
import pykeyvi

docvecs_process_input_keyvi_index_file = "docvecs_urlid_url.kv"
output_data_path = "/raid/ankit/doc2vec/out_s_p_1M"
doc2vec_trained_model = 'pages_with_spaces.doc2vec'
_alpha, _min_alpha, _passes = (0.020, 0.001, 20)

print "Loading keyvi dictionaries ..."
keyvi_dict=pykeyvi.Dictionary("{}/{}".format(output_data_path, docvecs_process_input_keyvi_index_file))
print "Finished Loading key-vi Dictionary."

print "Loading Doc2Vec Model ... "
model = Doc2Vec.load("{}/{}".format(output_data_path, doc2vec_trained_model))
print "Model Loaded Successfully!"


def get_similar_urls(sample_query, nearest_num):
    tokens = sample_query.lower().split()
    dv = model.infer_vector(tokens, alpha=_alpha, min_alpha=_min_alpha, steps=_passes)     # note: may want to use many more steps than default
    sims = model.docvecs.most_similar(positive=[dv],  topn=nearest_num)
    for url_id, distance in sims:
        url = ""
        for m in keyvi_dict.Get(str(url_id)):
            url = m.GetValueAsString()
        print "{}\t{}\t{}".format(url_id, url, distance)

def main():
    print "\nSimilar URLS for Queries - Doc2Vec Retrieval Interface [All URL's]"
Example #48
def similarity_scores(df, meth):
    from collections import defaultdict
    from gensim import corpora

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords

    textsList = df[['description', 'title']].values.T.tolist()
    textsList_flat = [item for sublist in textsList for item in sublist]

    documents = textsList_flat

    if meth == 'd2v':
        # Doc2Vec preprocessing
        tagged_data = [
            TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
            for i, _d in enumerate(documents)
        ]  # this is sufficient for a word-order-preserving model in which punctuation is retained

        #Doc2Vec

        #run the model
        max_epochs = 10
        vec_size = 25
        alpha = 0.03

        model = Doc2Vec(size=vec_size,
                        alpha=alpha,
                        min_alpha=0.00025,
                        min_count=2,
                        dm=1)

        model.build_vocab(tagged_data)

        for epoch in range(max_epochs):
            print('iteration {0}'.format(epoch))
            model.train(tagged_data,
                        total_examples=model.corpus_count,
                        epochs=model.iter)
            # decrease the learning rate
            model.alpha -= 0.00025
            # fix the learning rate, no decay
            model.min_alpha = model.alpha

        model.save("d2v_mixed7.model")
        print("Model Saved")

        # compute cosine similarity
        cossims = []
        for i in range(len(df)):
            cossimil = model.docvecs.similarity(i, (len(df) + i))
            cossims.append(cossimil)

    elif meth == 'lda' or meth == 'lsi':
        # BOW model preprocessing

        # Split the document into tokens
        from gensim.utils import simple_preprocess

        def sent_to_words(sentences):
            for sentence in sentences:
                yield (simple_preprocess(str(sentence), deacc=True))

        texts = list(sent_to_words(documents))

        # Remove common words and words that are only one character.
        stoplist = set('for a of the and to in'.split())
        #stoplist = set(stopwords.words('english'))
        texts = [[
            token for token in doc
            if (len(token) > 1) and (token not in stoplist)
        ] for doc in texts]

        # Lemmatize the documents.
        from nltk.stem.wordnet import WordNetLemmatizer

        lemmatizer = WordNetLemmatizer()
        texts = [[lemmatizer.lemmatize(token) for token in doc]
                 for doc in texts]

        # Lemmatized reduction with spaCy package, to keep only certain word-classes (noun, adjective, verb, adverb) i.e. remove prepositions etc
        # function from https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
        def lemmatization(texts,
                          allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
            texts_red = []
            for sentence in texts:
                doc = nlp(" ".join(sentence))
                texts_red.append([
                    token.lemma_ for token in doc
                    if token.pos_ in allowed_postags
                ])
            return texts_red

        import spacy
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        texts = lemmatization(texts,
                              allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

        # Filter carefully to remove rarest words (occurring in less than 15 documents), or common lemmas (more than 60% of the documents)
        dictionary = corpora.Dictionary(texts)
        dictionary.filter_extremes(no_below=15, no_above=0.6)

        # Construct the final corpus, bag-of-words representation of documents
        corpus = [dictionary.doc2bow(text) for text in texts]

        #Run the model
        if meth == 'lda':

            #LDA

            from gensim import models

            lda_model = models.LdaModel(corpus=corpus,
                                        id2word=dictionary,
                                        alpha='auto',
                                        eta='auto',
                                        iterations=10,
                                        passes=2,
                                        num_topics=100,
                                        eval_every=None,
                                        decay=0.8,
                                        offset=1)
            corpus_lda = lda_model[corpus]

            # compute cosine similarity
            from gensim.matutils import cossim
            cossims = []

            for i in range(len(df)):
                cossimil = cossim(corpus_lda[i], corpus_lda[len(df) + i])
                cossims.append(cossimil)

        else:

            #LSI (with TFIDF)

            from gensim import models

            tfidf = models.TfidfModel(
                corpus,
                smartirs='npc',  #probabilistic idf
                slope=0.2
            )  #lower slope means longer documents are favoured more (usually an effective choice for TFIDF)
            corpus_tfidf = tfidf[corpus]
            lsi_model = models.LsiModel(corpus_tfidf,
                                        id2word=dictionary,
                                        num_topics=300,
                                        power_iters=2)
            corpus_lsi = lsi_model[corpus_tfidf]

            # compute cosine similarity
            from gensim.matutils import cossim
            cossims = []

            for i in range(len(df)):
                cossimil = cossim(corpus_lsi[i], corpus_lsi[len(df) + i])
                cossims.append(cossimil)

    else:
        print("Please provide a valid method ('lda', 'lsi', 'd2v')")

    df_sims = df.assign(sim=cossims)
    return df_sims
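A hypothetical call, assuming the older gensim API used inside the function and a DataFrame whose rows pair a `description` with a `title`:

import pandas as pd

# toy input, for illustration only
toy_df = pd.DataFrame({
    'description': ['cheap direct flights to rome this summer',
                    'handmade leather wallet with card slots'],
    'title': ['rome summer flights, cheap and direct',
              'leather card wallet, handmade'],
})
scored = similarity_scores(toy_df, 'd2v')
print(scored[['title', 'sim']])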
Example #49
File: wsd.py  Project: gsi-upm/sematch
 def load(cls, model_file='synset2vec'):
     model = Doc2Vec.load(model_file)
     return cls(model)
Example #50
def load_model(model_path):
    model = Doc2Vec.load(os.path.join(
        os.path.dirname(os.path.abspath('__file__')), model_path)
    )
    return model
Example #51
        vec.append(float(el))
    return vec


# load w2v model and w2v vectors
model_path = 'araneum_none_fasttextcbow_300_5_2018/araneum_none_fasttextcbow_300_5_2018.model'
model_w2v = Word2Vec.load(model_path)

broken_vectors = pd.read_csv('model_vec_w2v.csv').text_vec
model_vectors_w2v = broken_vectors.apply(make_list)

# load d2v model and d2v vectors
broken_vectors_d2v = pd.read_csv('model_vectors_d2v.csv').text_vec
model_vectors_d2v = broken_vectors_d2v.apply(make_list)

model_d2v = Doc2Vec.load('doc2vec.model')

# open texts
texts = list(pd.read_csv('corpus.csv').text)

# open inverted index
inv_ind = json.load(open('invind.json'))


@app.route('/', methods=['GET'])
def index():
    if request.args:
        query = request.args.get('query')
        search_method = request.args.get('search_method')
        search_result = search(query, search_method)
        return render_template('result.html',
Example #52
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 20:54:58 2019

@author: Athan
"""
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize
model= Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)
    
Example #53
            model_DM.train(training_doc)
            model_DBOW.train(training_doc)

        # Save the trained models:
        fout = 'DM.d2v'
        model_DM.save(most_recent + fout)
        model_DM.init_sims(replace=True)

        fout = 'DBOW.d2v'
        model_DBOW.init_sims(replace=True)
        model_DBOW.save(most_recent + fout)

    else:
        # Load Doc2Vec model from disk:
        fout = 'DM.d2v'
        model_DM = Doc2Vec.load(most_recent + fout)

        fout = 'DBOW.d2v'
        model_DBOW = Doc2Vec.load(most_recent + fout)


    # Both Doc2Vec variants (DM and DBOW) are now available; NB DBOW is the one
    # more similar to the skip-gram architecture recommended by the original
    # Word2Vec authors.

    print('nonmatch', model_DM.doesnt_match("delay government flooding lightning".split()))

    print('nonmatch', model_DM.doesnt_match("euref voteout remain lightning".split()))
    print('euref sim by word', model_DM.similar_by_word('euref'))
    print('flood ', model_DM.similar_by_word('flood'))
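One common way to combine the two variants (as in the Paragraph Vector paper and gensim's IMDB example) is to concatenate their document vectors; a hedged sketch, not part of the original script:

import numpy as np

def combined_docvec(tag):
    # concatenate the DM and DBOW vectors learned for the same document tag
    return np.concatenate([model_DM.docvecs[tag], model_DBOW.docvecs[tag]])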
Example #54
        'job': str,
        'description': str,
        'others': str
    },
    error_bad_lines=False)
job_description_df.describe()
job = job_description_df['job']
jobno = job_description_df['jobno']
job_in_now = job.tolist()
jobno_now = jobno.tolist()
yy = job_in_now[9210]
print(len(job_in_now))
print(yy)
# Load the trained model wiki_seg_job_0728_true_model.txt, then put the text to be
# tested into test_data_1 to find the most similar job postings
model = Doc2Vec.load(
    '/Users/liouscott/Documents/scott/104_competition/model/wiki_seg_job_0728_true_model.txt'
)
test_data_1 = '工讀生'
test_cut_raw_1 = []
item2 = (pseg.cut(test_data_1))
for k in list(item2):
    test_cut_raw_1.append(k.word)
inferred_vector = model.infer_vector(test_cut_raw_1)
sims = model.docvecs.most_similar([inferred_vector], topn=20)
sims_two = np.dot(model.docvecs[6682], model.docvecs[50021])
print(sims)  # sims is a list of tuples: (index_of_document, similarity)
print(sims_two)
print(content[9210])
print(len(description_in))
print(len(model.docvecs))
print(model.docvecs)
    if deps_model_file != "":
        has_deps_embeddings = True
        logging.info("Loading dependency embeddings from %s" % deps_model_file)
        deps_model = Embeddings.load(deps_model_file+".npy", deps_model_file+".vocab")
        logging.info("Deps Model loaded!")

        #deps_vocabulary = deps_model._vocab
        #deps_embeddings = deps_model._vecs


    # Load Models here
    is_doc2vec_model = False
    # load word2vec word2vec_model
    if doc2vec_model_file != '':
        model = Doc2Vec.load(doc2vec_model_file)
        is_doc2vec_model = True
    else:
        if word2vec_load_bin:
            model = Word2Vec.load_word2vec_format(word2vec_model_file, binary=True)  # use this for google vectors
        else:
            model = Word2Vec.load(word2vec_model_file)

    use_id_for_vector = use_id_for_vector and is_doc2vec_model

    word2vec_num_features = len(model.syn0[0])
    logging.info("Embeddings feature vectors length:%s" % word2vec_num_features)
    logging.info("Model syn0 len=%d" % (len(model.syn0)))

    # define classes
    class_mapping = dict([(val, idx) for idx, val in enumerate(valid_senses)])
Example #56
def nlp():
    tagged_data = []
    stemmer = SnowballStemmer("hungarian")
    hu = detect("nagyon szertém ha működnél köszi puszi")
    #   assert gensim.models.doc2vec.FAST_VERSION > -1
    conn = sqlite3.connect(
        r'C:\Users\Domos\Documents\andris disszertacio\url.db')
    curr = conn.cursor()
    # The five yearly corpora (2014-2018) are built with identical SQL queries
    # and cleaning rules, so collect them in a single loop.
    a = []
    for year in ("2014", "2015", "2016", "2017", "2018"):
        curr.execute(
            """SELECT DISTINCT paragaph FROM 'psArticle_tb' where time like "%{}%";"""
            .format(year))
        ps = ""
        for row in curr.fetchall():
            urlrow = str(row)
            if urlrow == "":
                continue
            # strip escaped whitespace and punctuation left over from scraping
            for old, new in (('\\xa0', ' '), ('\xa0', ' '), ('\\n', ""),
                             ('\\r', ""), ('\\', ""), ('\\xadt', " "),
                             ('\\t', ""), ('(', ""), (')', ""), ("'", ""),
                             (',', ""), ('[', ""), (']', "")):
                urlrow = urlrow.replace(old, new)
            # keep only paragraphs that langdetect can classify as Hungarian
            try:
                detect(urlrow) == hu
            except:
                continue
            else:
                if detect(urlrow) == hu:
                    ps += urlrow
        a.append(ps)
        print("ps{} done".format(year[2:]))

    for j, _k in enumerate(a):
        words = []
        w = word_tokenize(_k.lower())
        for word in w:
            words.append(stemmer.stem(word))
        tags = [str(j)]
        tagged_data += [TaggedDocument(words, tags)]
    print(tagged_data)

    max_epochs = 2
    vec_size = 300
    alpha = 0.025

    model = Doc2Vec(size=vec_size,
                    alpha=alpha,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=1)
    model.build_vocab(tagged_data)

    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
        model.alpha -= 0.0002
        model.min_alpha = model.alpha

    model.save("ps.model")
    print("Model Saved")

    model = Doc2Vec.load("ps.model")
    #to find the vector of a document which is not in training data
    print(model.wv.most_similar("migráns"))
    print(model.docvecs.most_similar([1]))
from pyspark import SparkContext
from pyspark.sql import SQLContext

from gensim.models.doc2vec import Doc2Vec

sc = SparkContext()
sqlContext = SQLContext(sc)

# this is a large object we cache it on each worker node
gmod_broadcast = sc.broadcast( Doc2Vec.load("/root/doc2vec/doc2vec_model/hn") ) 

df = sqlContext.read.load("hdfs:///hndata/parquet_typed", format="parquet")

ids = df.where("score IS NOT NULL") \
         .where("type='story'") \
         .where("title IS NOT NULL") \
         .map(lambda row: row.id)

def mergeVec(id):
    gmod = gmod_broadcast.value 
    vec = gmod.docvecs["TITLE_%d" % id]
    return (id, vec)
    
docvecs = ids.map(mergeVec) 
docvecs.saveAsPickleFile("hdfs:///hndata/docvecs_glove_pickle")
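A quick sanity check (a sketch, reusing the same SparkContext) could read a few of the saved (id, vector) pairs back:

# read back a handful of (id, docvec) pairs from the pickle file
for doc_id, vec in sc.pickleFile("hdfs:///hndata/docvecs_glove_pickle").take(3):
    print(doc_id, vec[:5])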
Example #58
def load_doc2vec_model():
    # os.path is not subscriptable; DIR_PATH is presumably meant to come from
    # the environment
    fname = get_tmpfile(os.environ["DIR_PATH"] +
                        "/data/doc2vec/v2/doc2vec_articles_181030")
    model = Doc2Vec.load(fname)
    return model
Example #59
def mkExistingTrainedModel(path):
    return Doc2Vec.load(path)
Example #60
tagged_data = [
    TaggedDocument(words=word_tokenize(sentence.lower()),
                   tags=[sentence.lower()]) for sentence in df['DESC']
]
df['TAGGED_DATA'] = tagged_data

# tagged_data contains TaggedDocuments whose words are the tokens and whose tag is
# the cleaned sentence itself, so that each document can be identified later

max_epochs = 100
vec_size = 100
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)

model.build_vocab(tagged_data)
for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

# to get a sentence vector, use the cleaned form of the sentence as an index into model.docvecs
# e.g. the vector for df['DESC'][0] is model.docvecs[df['DESC'][0]] (or model[df['DESC'][0]])
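# A hedged usage sketch of the tagging scheme described above: the trained
# vector is looked up with the (lower-cased) sentence itself, and unseen text
# is still handled through infer_vector.
first_desc = df['DESC'][0]
trained_vec = model.docvecs[first_desc.lower()]
inferred_vec = model.infer_vector(word_tokenize(first_desc.lower()))
print(trained_vec.shape, inferred_vec.shape)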