def merge_categories_to_whole_set(path):
    # Walk the name list in category order and emit one TaggedDocument per category,
    # tagged with the category name.
    count = 1
    name_list = get_namelist(path)
    init_category = 'null'
    list_tmp_to_store_sample = []
    output = []
    for i, line in enumerate(name_list):
        classification = classification_extract(line[0])
        file = line[1]
        if classification != init_category:
            # Flush the previous category before starting a new one.
            # Note: this aliases the accumulator, so each extend() doubles the word list.
            list_tmp_to_store_sample_tmp = list_tmp_to_store_sample
            list_tmp_to_store_sample_tmp.extend(list_tmp_to_store_sample)
            list_tmp_to_store_sample_tmp.extend(list_tmp_to_store_sample)
            if count > 1:
                output.append(d2v.TaggedDocument(list_tmp_to_store_sample_tmp, [init_category]))
            count += 1
            init_category = classification
            list_tmp_to_store_sample = []
        if classification == init_category:
            with open(file, 'r', encoding="utf-8") as f:
                try:
                    contents = f.readline()
                    line_split = contents.split()
                    if len(line_split) > 1:
                        list_tmp_to_store_sample.extend(line_split)
                except Exception as e:
                    print(e)
                    continue
    # Flush the final category after the loop.
    list_tmp_to_store_sample_tmp = list_tmp_to_store_sample
    list_tmp_to_store_sample_tmp.extend(list_tmp_to_store_sample)
    list_tmp_to_store_sample_tmp.extend(list_tmp_to_store_sample)
    output.append(d2v.TaggedDocument(list_tmp_to_store_sample_tmp, [init_category]))
    return output
def __iter__(self):
    document_id = 0
    for file_ind, file_name in enumerate(self.files_list):
        try:
            fasta_sequences = SeqIO.parse(
                _open(os.path.join(self.input_folder, file_name)), 'fasta')
            seq_id = 0
            for fasta in fasta_sequences:
                seq_id += 1
                name, sequence = fasta.id, str(fasta.seq)
                documents_list = self._get_document_from_fasta(
                    sequence, self.processing_mode, self.k, self.shift_size)
                for doc_ind, doc in enumerate(documents_list):
                    yield doc2vec.TaggedDocument(doc, [document_id])
                # Use same document_id for all sequences if non-overlapping
                document_id += 1
            if file_ind % 1 == 0:
                print(
                    f"Finished processing file #{file_ind}, file_name: {file_name}, "
                    f"number of genes: {seq_id}, document_id: {document_id}")
        except Exception as e:
            print(f"****ERROR IN PARSING file: {file_name}, seq_id: {seq_id},")
            print(f"name: {name} sequence: {sequence}")
            print(f"Error message: {e}")
def tokenize(df, col, tokens_only=False):
    """
    Given a DataFrame and a column, tokenizes the words in that column

    Parameters
    ----------
    df: DataFrame
        dataframe with column to be tokenized
    col: str
        column name of text to be tokenized
    tokens_only: bool
        to train the doc2vec model, we'll need to associate a tag/number
        with each document of the training corpus.
        tokens_only=True means don't associate anything

    Returns
    ----------
    list
        tokenized words
    """
    # max_len=20 just in case there are important words over 15 chars long
    tokens = df[col].apply(lambda x: simple_preprocess(x, deacc=True, max_len=20))
    if tokens_only:
        return tokens
    else:
        # For training data, add tags -- notice it is just an index number
        return [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(tokens)]
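# Hedged usage sketch for tokenize() above -- the DataFrame, the "text" column name,
# and the hyperparameters are illustrative assumptions, not taken from the original code.
import pandas as pd
from gensim.models import doc2vec
from gensim.utils import simple_preprocess

example_df = pd.DataFrame({"text": ["first example document", "second example document"]})
train_corpus = tokenize(example_df, "text")  # list of TaggedDocument, tagged by index
d2v_model = doc2vec.Doc2Vec(vector_size=50, min_count=1, epochs=10)
d2v_model.build_vocab(train_corpus)
d2v_model.train(train_corpus, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)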
def preprocess_lines(pdf_content, document_tag):
    """Preprocesses a string of words.

    For the moment, the steps are:
    - removes everything but letters
    - splits into lines
    - converts to lower case
    - splits each line into words
    - takes stems of words
    - takes out one and two character words
    - takes out stop words (provided by nltk)
    """
    # take out numbers (for now)
    letters_only = re.sub("[^a-zA-Z\n]", " ", pdf_content)
    line_list = letters_only.split('\n')
    sentence_list = []
    stem_dict = {}
    for line in line_list:
        words = line.lower().split()
        # stem words
        meaningful_words, stem_dict_line = stem_words(words)
        stem_dict.update(stem_dict_line)
        # take out one and two character words
        meaningful_words = [w for w in meaningful_words if len(w) > 2]
        # take out stop words
        meaningful_words = [w for w in meaningful_words if w not in stops]
        if len(meaningful_words) != 0:
            sentence_list.extend(meaningful_words)
    if sentence_list != []:
        tagged_document = doc2vec.TaggedDocument(sentence_list, tags=[document_tag])
        return tagged_document, stem_dict
    else:
        return None, stem_dict
def __iter__(self):
    document_id = 0
    for file_ind, file_name in enumerate(self.files_list):
        try:
            fasta_sequences = SeqIO.parse(
                _open(os.path.join(self.input_folder, file_name)), 'fasta')
            seq_id = 0
            for fasta in fasta_sequences:
                x = random.random()
                # if x <= 0.5:
                #     continue
                seq_id += 1
                name, sequence = fasta.id, str(fasta.seq)
                documents_list = self._get_document_from_fasta(sequence)
                for doc_ind, doc in enumerate(documents_list):
                    yield doc2vec.TaggedDocument(doc, [document_id])
            if file_ind % 1 == 0:
                print(
                    f"Finished processing file #{file_ind}, "
                    f"file_name: {file_name.replace('.fna.gz', '')}, "
                    f"number of genes: {seq_id}, document_id: {document_id}")
        except Exception as e:
            print(f"****ERROR IN PARSING file: {file_name}, seq_id: {seq_id},")
            print(f"name: {name} sequence: {sequence}")
            print(f"Error message: {e}")
        document_id += 1
def train_doc2_vec(
    data_path,
    embed_size=64,
    epoch=10,
    min_count=1,
    window=4,
    workers=8,
    model_path="word2vec_entities_raw_agg_user.model",
):
    df_all = pd.read_csv(data_path)
    df_all.fillna("null", inplace=True)
    logging.info(f"Load All Data: {df_all.shape}")

    logging.info("Build Training Corpus for Doc2Vec")
    start = time.time()
    ex_entities = df_all["ex_entities"].tolist()
    # each entry of ex_entities becomes the words of one TaggedDocument, tagged with its row index
    train_corpus = [
        doc.TaggedDocument(word, [idx]) for idx, word in enumerate(ex_entities)
    ]

    logging.info("Train Doc2Vec model")
    model = doc.Doc2Vec(
        vector_size=embed_size,
        min_count=min_count,
        epochs=epoch,
        window=window,
        workers=workers,
    )
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    logging.info("finished ({:.2f} sec elapsed)".format(time.time() - start))
    model.save(model_path)
def test_mixed_tag_types(self):
    """Ensure alternating int/string tags don't share indexes in doctag_syn0"""
    mixed_tag_corpus = [doc2vec.TaggedDocument(words, [i, words[0]])
                        for i, words in enumerate(raw_sentences)]
    model = doc2vec.Doc2Vec()
    model.build_vocab(mixed_tag_corpus)
    expected_length = len(sentences) + len(model.docvecs.doctags)  # 9 sentences, 7 unique first tokens
    self.assertEqual(len(model.docvecs.doctag_syn0), expected_length)
def read_data(tag_with_genres=True):
    # read in dataframe
    info_df = pd.read_csv('album_info.csv', index_col=False)
    with open(os.path.join('corpus', filename), 'r') as f:
        # list of [id, 'sentence blahblah']
        data = [line.strip().split('\t') for line in f]
    train_docs = []
    for row in data:
        try:
            id_num = int(row[0])
            split_words = row[1].split(' ')
            id_label = [id_num]
            # also add genre tags in training docs
            if tag_with_genres:
                id_label += get_label(id_num, info_df)
            # add the data
            train_docs.append((split_words, id_label))
        except IndexError:
            print('read_data() IndexError: {}'.format(row))
    tagged_train_docs = [
        doc2vec.TaggedDocument(words=doc, tags=tag) for doc, tag in train_docs
    ]
    return tagged_train_docs
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : {iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, iterable of list of str}
        A collection of tagged documents used for training the model.

    Returns
    -------
    :class:`~gensim.sklearn_api.d2vmodel.D2VTransformer`
        The trained model.

    """
    if isinstance(X[0], doc2vec.TaggedDocument):
        d2v_sentences = X
    else:
        d2v_sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(X)]
    self.gensim_model = models.Doc2Vec(
        documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm,
        dbow_words=self.dbow_words, dm_concat=self.dm_concat,
        dm_tag_count=self.dm_tag_count, docvecs=self.docvecs,
        docvecs_mapfile=self.docvecs_mapfile, comment=self.comment,
        trim_rule=self.trim_rule, vector_size=self.size, alpha=self.alpha,
        window=self.window, min_count=self.min_count,
        max_vocab_size=self.max_vocab_size, sample=self.sample,
        seed=self.seed, workers=self.workers, min_alpha=self.min_alpha,
        hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean,
        hashfxn=self.hashfxn, epochs=self.iter, sorted_vocab=self.sorted_vocab,
        batch_words=self.batch_words
    )
    return self
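# A minimal usage sketch for the fit() above, assuming the enclosing class is
# gensim's sklearn_api D2VTransformer (gensim < 4.0); the toy corpus is illustrative.
from gensim.sklearn_api import D2VTransformer

toy_corpus = [["human", "interface", "computer"], ["survey", "user", "opinion"]]
transformer = D2VTransformer(size=10, min_count=1, iter=5)
transformer.fit(toy_corpus)                      # plain token lists get wrapped in TaggedDocument
doc_vectors = transformer.transform(toy_corpus)  # one vector per input document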
def label_sentences(corpus, label_type):
    labeled = []
    for i, v in enumerate(corpus):
        doc = " ".join(v)
        label = label_type + '_' + str(i)
        labeled.append(doc2vec.TaggedDocument(doc.split(), [label]))
    return labeled
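# Illustrative call to label_sentences(); the corpus and the 'TRAIN' prefix are
# made-up examples of the label_type convention used above.
sample_corpus = [["good", "movie"], ["weak", "plot"]]
train_tagged = label_sentences(sample_corpus, "TRAIN")
# train_tagged[0].tags == ['TRAIN_0'], train_tagged[0].words == ['good', 'movie']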
def get_filing_from_db(self):
    # read files from DB, clean them and create TaggedDocument
    conn = psycopg2.connect(settings.CONN_STRING)
    words = []
    i = 0
    for i in range(0, self.max_docs):
        pickle_file_name = settings.BASE_PATH_FILINGS + '{}.pickle'.format(self.file_list[i])
        if os.path.isfile(pickle_file_name):
            print('Found pickle file {}, skipping cleaning/tokenization...'.format(pickle_file_name))
            with open(pickle_file_name, 'rb') as f:
                words = pickle.load(f)
        else:
            cur = conn.cursor()
            cur.execute(
                'select f.filing, f.date_filed from filings f where f.filing_id = %s',
                (self.filing_id_list[i], ))
            filing_text = ''
            for record in cur:
                filing_text = record[0]
            cur.close()
            print('Cleaning {}...'.format(self.file_list[i]))
            words = get_words_from_doc(filing_text, self.tokenizer)
            with open(pickle_file_name, 'wb') as f:
                pickle.dump(words, f)
        yield doc2vec.TaggedDocument(words=words, tags=[self.file_list[i]])
    conn.close()
def trainingModel(alljudgements, listOfTopics, sentencesForTraining):
    model = Doc2Vec(dm=1, min_count=1, window=10, vector_size=150,
                    sample=1e-4, negative=10)
    # use all the extracted phrases of all files
    phrases = []
    for line in alljudgements:
        phrases.append(line)
    for line in listOfTopics:
        phrases.append(line)
    for line in sentencesForTraining:
        phrases.append(line)
    # tags must be a list; tag each sentence with its index
    sentences = [
        doc2vec.TaggedDocument(sentence, [i]) for i, sentence in enumerate(phrases)
    ]
    model.build_vocab(sentences)
    for epoch in range(500):
        model.train(sentences, epochs=model.epochs, total_examples=model.corpus_count)
        seconds = time.time()
        print("Seconds since epoch =", seconds)
        print("Epoch # {} is complete.".format(epoch + 1))
        if epoch % 30 == 0:
            # save model
            model.save('doc2vec2.model')
def my_doc2vec_model(doclist):
    reslist = []
    for i, doc in enumerate(doclist):
        blob = TextBlob(doc)
        noun_phrases = list(blob.noun_phrases)
        reslist.append(doc2vec.TaggedDocument(noun_phrases, [i]))
    return reslist
def test():
    MyUtils.init_logging("VectorizeDescriptions.log")
    docs_percent_touse = 1  # on the full training set, 0.3 is probably advisable.
    chunk_size = 10 ** 5
    doc_filenames = [F.DESCDOCS]  # , F.QADOCS_FILEPATH
    trainingset_ls = []
    for doc_filename in doc_filenames:
        for descdocs_chunk in pd.read_csv(doc_filename, chunksize=chunk_size):
            len_c = len(descdocs_chunk)
            indices = list(sorted(numpy.random.choice(len_c, int(docs_percent_touse * len_c), replace=False)))
            selected_rows = descdocs_chunk.iloc[indices]
            docs = []
            for tupl in selected_rows.itertuples():
                docs.append(D2V.TaggedDocument(words=ast.literal_eval(tupl.words),
                                               tags=ast.literal_eval(tupl.tags)))
            trainingset_ls.extend(docs)
            logging.info("Reading in the documents' words. Chunk processed...")
    logging.info("Completed: reading in a set of documents.")

    d2v_model = load_model()
    subset = trainingset_ls[0:5]
    logging.debug("%s", str(subset))
    for doc in subset:
        tag = doc.tags
        logging.debug("*** : %s", str(tag))
        logging.debug("XXX : %s", str(tag[0]))
        logging.debug("%s", str(d2v_model.docvecs[tag[0]]))
def train_doc2vec_classifier(X1, X2, y, embedd):
    docs_token = [doc.split() for doc in X1]
    documents = [doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(docs_token)]

    # get vector representation of docs
    X_vector = []
    for doc in docs_token:
        vector = embedd.infer_vector(doc)
        X_vector.append(vector)

    X = pd.concat([X2.reset_index(drop=True), pd.DataFrame(X_vector)], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.75, random_state=1)

    # oversampling training set
    sm = SMOTE(sampling_strategy=0.4)
    X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

    lr = LogisticRegression(solver="lbfgs", class_weight='balanced', max_iter=1000)
    lr.fit(X_train_sm, y_train_sm)
    y_hat = lr.predict(X_test)
    return lr, y_hat, X_test, y_test
def read_documents_for_doc2vec(tokens_only=False):
    for i, path_content in enumerate(read_files_walk()):
        tokens = utils.preprocess(path_content[1])
        if tokens_only:
            yield path_content[0], tokens
        else:
            yield doc2vec.TaggedDocument(tokens, [i])
def create_tagged_documents(self, document_frame):
    permalinks = document_frame.permalink
    authors = document_frame.author
    tokens = document_frame.tokens
    return [
        d2v.TaggedDocument(words=tks, tags=[author + '/' + permalink])
        for tks, author, permalink in zip(tokens, authors, permalinks)
    ]
def tagNotes(noteDF, noteCol, freqDist, low=4, highPer=0.74):
    highCount = highPer * noteDF.shape[0]
    taggedNotes = []
    for i, note in enumerate(nu.tokenize_and_stop(noteDF, noteCol)):
        clean_note = list(
            filter(lambda x: _check_word(x, freqDist, low, highCount), note))
        taggedNotes.append(dv.TaggedDocument(clean_note, [i]))
    return taggedNotes
def testBuildVocabWarning(self, l):
    """Test if logger warning is raised on non-ideal input to a doc2vec model"""
    raw_sentences = ['human', 'machine']
    sentences = [doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(raw_sentences)]
    model = doc2vec.Doc2Vec()
    model.build_vocab(sentences)
    warning = "Each 'words' should be a list of words (usually unicode strings)."
    self.assertTrue(warning in str(l))
def get_d2v_corpus(corpora):
    for i, line in enumerate(corpora):
        # For training data, add tags
        yield doc2vec.TaggedDocument([
            tkn for tkn in line.encode('utf-8').decode('utf-8').lower().split()
            if tkn not in stopwords_en
        ], [i])
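# Illustrative use of get_d2v_corpus(); 'stopwords_en' is assumed to be a set of
# English stop words defined elsewhere in the same module.
sample_lines = ["The first example SENTENCE", "another short line of text"]
tagged_docs = list(get_d2v_corpus(sample_lines))
# each element is a TaggedDocument with lower-cased, stop-word-filtered tokens and tags == [i]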
def tagged_sentences(self, sentences):
    tagged_sents = []
    for i, sent in enumerate(sentences):
        tagged_sents.append(d2v.TaggedDocument(sent, ["sent_{}".format(i)]))
    print('tagged_sents:', len(tagged_sents))
    return tagged_sents
def read_as_tagged_document(self, filename):
    """
    Read in a single document as a TaggedDocument object
    :param filename: The name of the file
    :return: A TaggedDocument object
    """
    f = path.join(self.__path, filename)
    content = self.read_preprocessed(filename)
    return doc2vec.TaggedDocument(content, [filename])
def build_corpus(file_pre):
    with open(file_pre + "_train.txt", 'r') as f:
        for line in f.readlines():
            line = line.strip("\n")
            line = line.split("\t")
            tag = int(line[-1])
            line_list = line[0].split(' ')
            # print(line_list, tag)
            yield d2v.TaggedDocument(line_list, [tag])
def __iter__(self):
    f = open(fileName)
    for l in f:
        tabIndex = l.find('\t')
        asin = l[:tabIndex]
        text = l[tabIndex + 1:]
        cleaned = re.sub(r'[^\w]', ' ', text).lower()
        # split on runs of whitespace so zero-width matches don't split between every character
        yield d2v.TaggedDocument(words=re.split(r"\s+", cleaned), tags=[asin])
    f.close()
def __next__(self):
    try:
        data = next(self.corpus)
        t = self.__preprocess(data[1])
        return doc2vec.TaggedDocument(t, [data[0]])
    except StopIteration:
        self.rebot()
        raise StopIteration
def read_corpus(self):
    """
    Get all documents in the current folder in TaggedDocument type
    :return: generator of documents to produce TaggedDocuments
    """
    files = listdir(self.__path)
    for i, document in enumerate(files):
        content = self.read_preprocessed(document)
        yield doc2vec.TaggedDocument(content, [document])
def getCleanLabeledReviews(reviews):
    clean_reviews = []
    for review in reviews["review"]:
        clean_reviews.append(clear_review_to_words(review))

    labelized = []
    for i, id_label in enumerate(reviews["id"]):
        labelized.append(doc2vec.TaggedDocument(clean_reviews[i], [id_label]))
    return labelized
def tagged_iterator(text_iterator):
    """Processes texts in the text_iterator and returns an iterator of tagged documents"""
    count = 0
    for bow in text_iterator:
        if len(bow) > 0:
            yield doc2vec.TaggedDocument(bow, [count])
            count += 1
    print(count - 1)  # index of the last tagged document
def train_model_by_tokens(documents_in_tokens: Iterable[Iterable[str]],
                          vector_size: int, min_count: int):
    documents = [
        doc2vec.TaggedDocument(tokens, [i])
        for i, tokens in enumerate(documents_in_tokens)
    ]
    return doc2vec.Doc2Vec(documents, vector_size=vector_size, min_count=min_count)
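# Usage sketch for train_model_by_tokens(); the token lists and the query document
# are illustrative only.
token_docs = [["alpha", "beta", "gamma"], ["beta", "delta", "epsilon"]]
d2v_model = train_model_by_tokens(token_docs, vector_size=20, min_count=1)
query_vector = d2v_model.infer_vector(["alpha", "delta"])  # embed an unseen document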
def train_doc2vec(self):
    x_train = []
    for line in self.dataset:
        patent = json.loads(line)
        x_train.append(
            doc2vec.TaggedDocument(patent['words'], [patent['patent_number']]))
    self._train(x_train)