# Tail of a document-text loading step, followed by Doc2Vec corpus tagging
# and model training.
#
# NOTE(review): this chunk is ONE physical line. As written, Python executes
# only `for para in doc.paragraphs: text += para.text`; everything after the
# first `#` is comment text. The original line breaks and indentation appear
# to have been lost -- restore them from the original file before running or
# refactoring this code; the nesting cannot be determined from this line.
#
# Reading the collapsed text left to right, the steps appear to be:
#   1. Append each paragraph's `.text` to `text`, then append `text` to
#      `corpus` (the inline note says stop-word removal and stemming are not
#      applied at this point).
#   2. Append a one-element list to `filenameraw` holding the last path
#      component of `doc`, cut at its first '.'.
#   3. `except: print(doc)` -- a bare except whose `try:` is above this
#      chunk; the only visible failure handling is printing `doc`.
#   4. Build `tagged_cr`: one TaggedDocument per `corpus` entry, with words
#      from `doc.split()` and its enumerate index as the single tag.
#   5. Create a gensim Doc2Vec model (vector_size=100, min_count=1,
#      epochs=80, alpha=0.025), call build_vocab on `tagged_cr`, then train
#      with total_examples=model.corpus_count and epochs=model.epochs.
#   6. Initialise empty `ranks` and `second_ranks` lists under the
#      "Overfitting Test" marker.
#
# NOTE(review): `doc` is used both as an object with `.paragraphs` and as the
# path argument to os.path.split, and is rebound again by the enumerate loop
# over `corpus` -- confirm how it is bound above this chunk.
for para in doc.paragraphs: text += para.text #summary=model(text) corpus.append(text) #### stop and stem not applied here #summarizedtext.append(summary) filenameraw.append([os.path.split(doc)[1].split('.')[0]]) except: print(doc) #********************************************************************************* #****************creating tagged corpus tagged_cr = [] for idx, doc in enumerate(corpus): tagged_cr.append(TaggedDocument(words=doc.split(), tags=[idx])) #*********model prep model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=1, epochs=80, alpha=0.025) model.build_vocab(tagged_cr) model.train(tagged_cr, total_examples=model.corpus_count, epochs=model.epochs) #vector = model.infer_vector(['dancing', 'reading', 'theatre', 'machine', 'learning', 'skills']) #print(vector) ##Overfitting Test ranks = [] second_ranks = []