# assumed imports for this snippet: PyHive for the Hive connection, pandas,
# json.loads and the Keras Tokenizer; LOOKBACK_DAYS, MAX_NUM_WORDS and
# MAX_PLOT_WORDS are module-level constants defined elsewhere
from json import loads

import pandas as pd
from pyhive import hive
from tensorflow.keras.preprocessing.text import Tokenizer


def read_articles():
    # connect to Hive and read the articles published in the lookback window
    print('>>> reading articles from Hive...')
    cursor = hive.connect('quickstart.cloudera').cursor()
    hive_query = ("SELECT title, publishedAt, content FROM article "
                  "WHERE unix_timestamp(publishedAt) > "
                  "cast(CURRENT_TIMESTAMP AS BIGINT) - {lookback_period}")
    hive_query = hive_query.format(
        lookback_period=LOOKBACK_DAYS * 24 * 60 * 60)
    cursor.execute(hive_query)
    res = cursor.fetchall()
    print("<<< %d articles read" % len(res))

    # convert the publication date to a date-time and bucket it by hour
    articles = pd.DataFrame(res, columns=['title', 'publishedAt', 'content'])
    articles['publishedAt'] = pd.to_datetime(articles['publishedAt'])
    articles['publishedAtHour'] = articles['publishedAt'].dt.strftime(
        "%Y-%m-%d %H")

    # count how many articles were published each hour
    articles_grouped = articles['title'].groupby(articles['publishedAtHour'])
    articles_cnt_hourly = articles_grouped.count()
    # draw a line plot with the results
    # articles_cnt_hourly.plot.line()

    # count the most frequently occurring words in the article bodies
    content = [c for c in articles['content'].tolist() if c is not None]
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(content)
    word_counts = loads(tokenizer.get_config()['word_counts'])
    most_frequent_words = {k: v for k, v in sorted(
        word_counts.items(), key=lambda item: item[1], reverse=True)}
    words, counts = zip(*most_frequent_words.items())
    words = words[:MAX_PLOT_WORDS]
    counts = counts[:MAX_PLOT_WORDS]
    return articles_cnt_hourly, words, counts
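# A possible way to visualize what read_articles() returns; this is not part
# of the original function. It assumes matplotlib is installed, and the figure
# layout and labels below are illustrative choices, not taken from the source.
import matplotlib.pyplot as plt

articles_cnt_hourly, words, counts = read_articles()

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
articles_cnt_hourly.plot.line(ax=ax1)   # articles published per hour
ax1.set_ylabel('articles per hour')
ax2.bar(words, counts)                  # most frequent words
ax2.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()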
                        default=5, help="Epochs to train the model.")
    args = parser.parse_args()
    return args


# rebuild the lists so the tokenizer actually sees the preprocessed
# sentences (the German ones wrapped in start/end tokens)
english_sentences = [preprocess_sentence(es) for es in english_sentences]
german_sentences = [start_token + preprocess_sentence(gs) + end_token
                    for gs in german_sentences]

tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(german_sentences)
config = tokenizer.get_config()
word_index = json.loads(config['word_index'])
index_words = json.loads(config['index_word'])

# pick a few random sample sentences to inspect
num_samples = 5
inx = np.random.choice(len(english_sentences), num_samples, replace=False)
print(inx)

sequences = tokenizer.texts_to_sequences(german_sentences)
padded = pad_sequences(sequences, padding='post', value=0)


def map_embedding_f(x, y):
    inp = []
    # left-pad each sequence along the time axis so every input has length 13
    pad = tf.pad(x, paddings=[[13 - tf.shape(x)[0], 0], [0, 0]],
                 mode='CONSTANT')
    inp.append(pad)
import json

sentence_data = string_data.split(".")
# print(sentence_data)

additional_filters = '-""'
token = Tokenizer(num_words=None,
                  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + additional_filters,
                  lower=True,
                  split=" ",
                  char_level=False,
                  oov_token="UNK",
                  document_count=0)
token.fit_on_texts(sentence_data)

tokenizer_config = token.get_config()
print(tokenizer_config.keys())
# print(tokenizer_config["word_index"])

word_counts = json.loads(tokenizer_config['word_counts'])
# print(word_counts)
print(word_counts["the"])

index_word = json.loads(tokenizer_config['index_word'])
word_index = json.loads(tokenizer_config["word_index"])
# print(sentence_data)
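# A minimal follow-on sketch, not in the original snippet: use the fitted
# tokenizer to turn the same sentences into padded integer sequences.
# Assumes tensorflow.keras is available; `token` and `sentence_data` come
# from the code above.
from tensorflow.keras.preprocessing.sequence import pad_sequences

sequences = token.texts_to_sequences(sentence_data)
padded = pad_sequences(sequences, padding='post', value=0)
print(padded.shape)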
# imports assumed by this class: python-dotenv, SQLAlchemy, pandas,
# scikit-learn and the Keras Tokenizer
from json import loads
from os import getenv

import dotenv
import pandas as pd
from pandas import json_normalize
from sklearn.preprocessing import MinMaxScaler
from sqlalchemy import create_engine
from tensorflow.keras.preprocessing.text import Tokenizer


class Transformer():
    def __init__(self):
        # try to load data from .env
        self.load_data()
        self.tokenizer = Tokenizer(num_words=1000, lower=True)

    def load_data(self):
        """A function that takes the database URL, fetches the contents of
        the strain_info table and saves it to a dataframe for training the
        model.
        """
        dotenv.load_dotenv()
        # prefer DATASOURCE, fall back to DATABASE_URL
        db_url = getenv("DATASOURCE") or getenv("DATABASE_URL")
        engine = create_engine(db_url)
        df = pd.read_sql("SELECT * FROM strain_info", engine)
        self.df = df
        return df

    def transform(self, document: pd.DataFrame, negative: list,
                  ignore: list) -> pd.DataFrame:
        """A function that transforms the features of the input dataframe
        into a combined document-term matrix and scales it.

        Arguments:
        -------------
        document {pd.DataFrame} : a dataframe of text features representing
            the documents to be transformed
        negative {list} : the list of negative features whose term
            frequencies are subtracted from the combined dtm
        ignore {list} : a list of features to leave out of the combined dtm

        Returns:
        -------------
        combined_scaled {pd.DataFrame} : a dataframe of the transformed
            document's scaled tf-idf values
        index {list} : the row index of the input document
        """
        # start from zero so the first feature's dtm defines the shape
        dtm = 0
        for i in document.columns:
            if i in ignore:
                continue
            # subtract the term frequencies of negative features from the
            # combined dtm, add everything else
            if i in negative:
                dtm -= self.find_dtm(document[i])
            else:
                dtm += self.find_dtm(document[i])
        mm = MinMaxScaler()
        combined_scaled_values = mm.fit_transform(dtm)
        combined_scaled_columns = dtm.columns.tolist()
        combined_scaled = pd.DataFrame(combined_scaled_values,
                                       columns=combined_scaled_columns)
        combined_scaled.fillna(0, inplace=True)
        return combined_scaled, document.index.tolist()

    def find_dtm(self, feature):
        """A function to tokenize a single feature and return its tf-idf
        matrix as a dataframe.
        """
        self.tokenizer.fit_on_texts(feature)
        a = self.tokenizer.texts_to_matrix(feature, mode='tfidf')
        # the vocabulary can be recovered from the tokenizer config; the
        # matrix columns themselves stay as word indices
        config = self.tokenizer.get_config()
        feature_names = json_normalize(loads(
            config['word_index'])).columns.tolist()
        dtm = pd.DataFrame(a)
        return dtm
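# Hypothetical usage sketch, not taken from the original code: the column
# names below are placeholders for whatever text columns strain_info actually
# has, and a reachable database URL in the environment is assumed.
#
#   t = Transformer()
#   scaled, idx = t.transform(t.df[['effects', 'description']],
#                             negative=['description'], ignore=[])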
Nwords = int(1e4)
tokenizer = Tokenizer(num_words=Nwords, oov_token='<OOV>')

XXX = 5000  # cap on the number of reviews to decode
reviews, lengths = [], []
i = 0
for dataset in [train_examples]:  # , test_examples]:
    for x, y in tqdm(dataset):
        text = encoder.decode(x)
        reviews.append(text.replace('<br />', ''))
        lengths.append(len(text))
        if i > XXX:
            break
        i += 1

tokenizer.fit_on_texts(reviews)
word_index = tokenizer.word_index
print(f'Tokenizer found {len(word_index)} different words')

# word counts come back from get_config() as a JSON string
vocabulary = json.loads(tokenizer.get_config()['word_counts'])
x, y = [], []
for k, v in vocabulary.items():
    x.append(k)
    y.append(v)
# indices of the Nwords most frequent words, most common first
inds = np.argsort(y)
inds = inds[-Nwords:][::-1]

maxlen = 1100
sequences = tokenizer.texts_to_sequences(reviews)
sequences = pad_sequences(sequences, padding='post', maxlen=maxlen)

# with open('word_index.dict', 'w') as f:
#     for word, index in word_index.items():
#         f.write(f'{word},{index}\n')
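# Small hedged check, not in the original snippet: use `inds` to peek at the
# top of the vocabulary with its raw counts.
top_words = [x[i] for i in inds[:20]]
top_counts = [y[i] for i in inds[:20]]
print(list(zip(top_words, top_counts)))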
tokenizer.fit_on_texts(train_x_data['token'])

# drop words that appear fewer than 5 times from the fitted vocabulary
low_count_words = [w for w, c in tokenizer.word_counts.items() if c < 5]
for w in low_count_words:
    del tokenizer.word_index[w]
    del tokenizer.word_docs[w]
    del tokenizer.word_counts[w]

train_sequence = tokenizer.texts_to_sequences(train_x_data['token'])
test_sequence = tokenizer.texts_to_sequences(test_x_data['token'])

sequence_data = dict()
sequence_data['train_seq'] = train_sequence
sequence_data['test_seq'] = test_sequence
sequence_data['train_token_list'] = train_x_data['token'].tolist()
sequence_data['test_token_list'] = test_x_data['token'].tolist()
sequence_data['tokenizer_config'] = tokenizer.get_config()

word_idx = tokenizer.word_index
MAX_SEQUENCE_LENGTH = int(np.median(after_len))
DATA_OUT_PATH = './assets/data/npy_data/{}/'.format(Today)

## Make output save directory
if os.path.exists(DATA_OUT_PATH):
    print("{} -- Folder already exists\n".format(DATA_OUT_PATH))
else:
    os.makedirs(DATA_OUT_PATH, exist_ok=True)
    print("{} -- Folder created\n".format(DATA_OUT_PATH))

train_input = pad_sequences(train_sequence, maxlen=MAX_SEQUENCE_LENGTH,
                            padding='post')
train_labels = np.array(train_y)
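# Hedged persistence sketch, not in the original snippet; the file names are
# placeholders. get_config() returns a plain dict, so sequence_data can be
# pickled as-is, and the tokenizer itself can be rebuilt later from to_json()
# with tokenizer_from_json.
import pickle
from tensorflow.keras.preprocessing.text import tokenizer_from_json

np.save(DATA_OUT_PATH + 'train_input.npy', train_input)
with open(DATA_OUT_PATH + 'sequence_data.pkl', 'wb') as f:
    pickle.dump(sequence_data, f)
with open(DATA_OUT_PATH + 'tokenizer.json', 'w') as f:
    f.write(tokenizer.to_json())

# later:
# with open(DATA_OUT_PATH + 'tokenizer.json') as f:
#     restored_tokenizer = tokenizer_from_json(f.read())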