def get_docs(self, doc_is):
    g.debug(f"Retrieving {len(doc_is)} documents...", 2)
    doc_ids = self._doc_ids[doc_is]
    query = f"""
        SELECT {self._id_column}, {self._text_column}
        FROM {self._table_name}
        WHERE {self._id_column} IN ({", ".join([str(x) for x in doc_ids])})
    """
    conn = get_connection()
    # a named (server-side) cursor streams large result sets in batches
    with conn.cursor(name="doc_getter") as cursor:
        cursor.itersize = g.DOC_BUFFER_SIZE
        cursor.execute(query)
        ids = []
        docs = []
        for id, doc in cursor:
            ids.append(id)
            if self._strip_html:
                doc = text_processing.strip_html(doc)
            docs.append(doc)
    # return the docs in the same order they were requested
    return np.array([docs[ids.index(doc_id)] for doc_id in doc_ids])
def cache_wordclouds(corpus, vocabulary, H, W):
    n_topics = H.shape[0]
    g.debug(f"Caching word clouds for {n_topics} topics...")
    topic_tfidf_weights = get_tfidf_topic_weights(corpus.tfidf_corpus, W)
    total = n_topics * 2
    complete = 0
    g.progress_bar(complete, total)
    for topic_i in range(n_topics):
        # NMF word cloud
        wc = build_word_cloud(H[topic_i], vocabulary)
        wc.to_file(f"output/wordclouds/{str(topic_i).rjust(3, '0')}_nmf.png")
        complete += 1
        g.progress_bar(complete, total)
        # TF-IDF word cloud
        if topic_tfidf_weights[topic_i].sum():
            wc = build_word_cloud(topic_tfidf_weights[topic_i], vocabulary)
        else:
            # an empty topic...
            wc = build_word_cloud([1], ["This topic was empty"])
        wc.to_file(f"output/wordclouds/{str(topic_i).rjust(3, '0')}_tfidf.png")
        complete += 1
        g.progress_bar(complete, total)
    g.debug(" -> Done", 1)
def export_ics(self, path):
    globals.debug("exporting ics file to " + path + " ...")
    cal = icalendar.Calendar()
    for d in self.__days:
        for event in d.get_next_event():
            cal_event = icalendar.Event()
            cal_event.add('summary', event.get_name())
            cal_event.add('dtstart',
                          dt.datetime(d.get_year(), d.get_month(), d.get_day(),
                                      globals.to_hours(event.get_start())[0],
                                      globals.to_hours(event.get_start())[1]))
            cal_event.add('dtend',
                          dt.datetime(d.get_year(), d.get_month(), d.get_day(),
                                      globals.to_hours(event.get_end())[0],
                                      globals.to_hours(event.get_end())[1]))
            cal_event.add('location', event.get_place())
            cal.add_component(cal_event)
    with open(path, 'wb') as ics_file:
        ics_file.write(cal.to_ical())
    globals.debug("finished export!")
def create_from_transform(self, vectorizer):
    self.query_n_docs()
    g.debug("Downloading and vectorizing documents...")
    self._tfidf_corpus = vectorizer.transform(self.query(), n_docs=self._n_docs)
    g.debug(" -> Done", 1)
    return self
def dump_features(word_list, table_name):
    g.debug("Writing word list to features.txt...")
    with open(f"../data/{table_name}/pickles/features.txt", "w") as f:
        for word in word_list:
            f.write(word + "\n")
    g.debug(f" -> Wrote {len(word_list)} words to file!", 1)
def search_models(tfidf_corpus, min_topics, max_topics, threshold=.333):
    g.debug("Building NMF topics...")
    # nmf_models = []
    costs = []
    H_similarities = []
    W_similarities = []
    tfidf_similarities = []
    max_strength = []
    min_strength = []
    avg_strength = []
    n_models = max_topics - min_topics + 1
    g.progress_bar(0, n_models)
    try:
        for i in range(min_topics, max_topics + 1):
            nmf, W, H = nmf_model(tfidf_corpus, i, max_iter=666, no_output=True)
            # nmf_models.append(nmf)
            costs.append(nmf.reconstruction_err_**2)
            H_similarities.append(
                1 - pairwise_distances(H, metric="cosine", n_jobs=-1).mean())
            W_similarities.append(
                1 - pairwise_distances(W, metric="cosine", n_jobs=-1).mean())
            W_normalized = W / W.max(axis=0)
            tfidf_similarities.append(
                np.mean([
                    pairwise_distances(
                        tfidf_corpus[W_normalized[:, topic_i] > threshold].A,
                        metric="cosine",
                        n_jobs=-1).mean() for topic_i in range(i)
                    if (W_normalized[:, topic_i] > threshold).any()
                ]))
            values = np.array([W[x, y] for x, y in np.transpose(W.nonzero())])
            max_strength.append(values.max())
            min_strength.append(values.min())
            avg_strength.append(values.mean())
            g.progress_bar(i - min_topics + 1, n_models,
                           text=f"{nmf.n_iter_} iterations")
    except KeyboardInterrupt:
        # keep only the iterations that fully completed
        completed = len(tfidf_similarities)
        costs = costs[:completed]
        H_similarities = H_similarities[:completed]
        W_similarities = W_similarities[:completed]
        max_strength = max_strength[:completed]
        min_strength = min_strength[:completed]
        avg_strength = avg_strength[:completed]
    return (costs, H_similarities, W_similarities, tfidf_similarities,
            max_strength, min_strength, avg_strength)
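# A minimal sketch of how these diagnostics might be used to pick a topic
# count: plot each metric over the candidate range and look for an elbow.
# The matplotlib usage and the 5..30 range are illustrative assumptions,
# not part of the original pipeline; since the metrics have very different
# scales, per-metric subplots may read better in practice.
import matplotlib.pyplot as plt

metrics = search_models(tfidf_corpus, min_topics=5, max_topics=30)
labels = ["cost", "H similarity", "W similarity", "tf-idf similarity",
          "max strength", "min strength", "avg strength"]
x = range(5, 5 + len(metrics[0]))  # may be shorter after a KeyboardInterrupt
for values, label in zip(metrics, labels):
    plt.plot(x, values, label=label)
plt.xlabel("number of topics")
plt.legend()
plt.show()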
def build_model_and_wordclouds(n_topics, tfidf_corpus, vocabulary, table_name):
    nmf, W, H = nmf_model(tfidf_corpus, n_topics, max_iter=666)
    pickle_dump(nmf, f"../data/{table_name}/pickles/NMF.pkl")
    pickle_dump(W, f"../data/{table_name}/pickles/W.pkl")
    corpus_topics = get_corpus_top_topics(W)
    build_word_clouds(tfidf_corpus, corpus_topics, H, vocabulary, table_name)
    g.debug("Done!")
def create_topic_model(corpus, n_topics):
    g.debug(f"Extracting {n_topics} latent topics...")
    nmf = NMF(n_components=n_topics,
              max_iter=g.MAX_ITER,
              random_state=g.RANDOM_STATE)
    W = nmf.fit_transform(corpus.tfidf_corpus)
    g.debug(f" -> Done in {nmf.n_iter_} iterations", 1)
    return nmf, W
def build_word_cloud(vocab_weights, vocabulary):
    g.debug("Building wordcloud...", 2)
    wc = WordCloud(background_color=g.BACKGROUND_COLOR,
                   max_words=g.MAX_WORDS,
                   width=g.WIDTH,
                   height=g.HEIGHT)
    wc.fit_words({vocabulary[word_i]: vocab_weights[word_i]
                  for word_i in range(len(vocab_weights))
                  if vocab_weights[word_i]})
    g.debug(" -> Done", 3)
    return wc
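# A minimal usage sketch rendering a cloud inline with matplotlib; the toy
# weights and words below are illustrative, and g.BACKGROUND_COLOR etc. are
# assumed to be configured as in the rest of the module.
import matplotlib.pyplot as plt

wc = build_word_cloud([0.9, 0.5, 0.1], ["alpha", "beta", "gamma"])
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()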
def nmf_model(corpus_tfidf, n_topics, max_iter=500, no_output=False):
    if not no_output:
        g.debug(f"Sorting corpus into {n_topics} topics...")
    nmf = NMF(n_components=n_topics, max_iter=max_iter, random_state=666)
    W = nmf.fit_transform(corpus_tfidf)
    H = nmf.components_
    if not no_output:
        g.debug(f" -> {nmf.n_iter_} iterations completed!", 1)
    return nmf, W, H
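# A minimal sketch of calling nmf_model on a toy corpus; the sample texts and
# the plain sklearn TfidfVectorizer here are illustrative, not the project's
# actual vectorization pipeline.
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ["cats purr and sleep", "dogs bark and fetch", "cats and dogs play"]
toy_tfidf = TfidfVectorizer().fit_transform(texts)
nmf, W, H = nmf_model(toy_tfidf, n_topics=2)
# W is (n_docs, n_topics): per-document topic weights
# H is (n_topics, n_features): per-topic word weights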
def init():
    init_api_keys()
    # suppress TensorFlow warnings
    tf.logging.set_verbosity(tf.logging.FATAL)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("planner").setLevel(logging.INFO)
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.getLogger("rasa").setLevel(logging.INFO)
    atexit.register(exit_handler)

    # if possible, restore TelegramManager state
    globals.debug("loading telegram manager...")
    global tel_man
    tel_man = TelegramManager(api_key=TELEGRAM_API_KEY)
    try:
        tel_man.restore(TG_STORAGE_PATH)
    except Exception:
        pass
    globals.debug("loading done!")

    # if possible, restore the RASA model
    globals.debug("loading rasa model...")
    global rasa_model
    rasa_model = RasaModelHandler(RASA_MODEL_PATH)
    globals.debug("loading done!")
def get_connection(credentials_path="data/json/db_creds.json"):
    global _connection
    if not _connection:
        debug("Connecting to Postgres database...")
        with open(credentials_path, "r") as f:
            creds = json.load(f)
        _connection = psycopg2.connect(host=creds["host"],
                                       dbname=creds["database"],
                                       user=creds["username"],
                                       password=creds["password"])
        debug(" -> Connection successful!", 1)
    return _connection
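# A minimal usage sketch: the connection is a module-level singleton, so
# repeated calls reuse the same psycopg2 connection. The table name below
# is illustrative.
conn = get_connection()
with conn.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) FROM articles")
    print(cursor.fetchone()[0])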
def query_n_docs(self):
    g.debug("Loading corpus...")
    count_query = f"""
        SELECT COUNT(*)
        FROM {self._table_name}
        WHERE {self._where_clause}
    """
    conn = get_connection()
    with conn.cursor() as cursor:
        cursor.execute(count_query)
        self._n_docs = cursor.fetchone()[0]
    g.debug(f" -> Found {self._n_docs} documents")
    return self._n_docs
def build_word_clouds(corpus_tfidf, corpus_topics, H, word_list, table_name):
    g.debug("Generating topic word clouds...")
    n_topics = H.shape[0]
    completed = 0
    g.progress_bar(completed, n_topics)
    topic_tfidf_weights = get_tfidf_topic_weights(corpus_tfidf, corpus_topics,
                                                  n_topics)
    topic_top_tfidf_words_i = np.argsort(topic_tfidf_weights, axis=1)[:, ::-1]
    topic_top_nmf_words_i = np.argsort(H, axis=1)[:, ::-1]
    for topic_i in range(n_topics):
        # NMF word cloud
        wc = WordCloud(background_color="black", max_words=333,
                       width=1000, height=500)
        wc.fit_words({
            word_list[word_i]: H[topic_i, word_i]
            for word_i in topic_top_nmf_words_i[topic_i] if H[topic_i, word_i]
        })
        wc.to_file(f"../output/{table_name}/nmf/{topic_i}_nmf_wordcloud.png")
        # TF-IDF word cloud, skipped for empty topics; guarded with an if
        # (rather than continue) so the progress bar still advances
        if topic_tfidf_weights[topic_i].sum():
            wc = WordCloud(background_color="black", max_words=333,
                           width=1000, height=500)
            wc.fit_words({
                word_list[word_i]: topic_tfidf_weights[topic_i, word_i]
                for word_i in topic_top_tfidf_words_i[topic_i]
                if topic_tfidf_weights[topic_i, word_i]
            })
            wc.to_file(
                f"../output/{table_name}/nmf/{topic_i}_tfidf_wordcloud.png")
        completed += 1
        g.progress_bar(completed, n_topics)
    g.debug(f" -> {n_topics} word clouds generated!", 1)
def get_doc_by_id(self, doc_id):
    g.debug(f"Retrieving doc {doc_id}...", 2)
    # identifiers can't be parameterized, but the id value can be
    query = f"""
        SELECT {self._text_column}
        FROM {self._table_name}
        WHERE {self._id_column} = %s
    """
    conn = get_connection()
    with conn.cursor() as cursor:
        cursor.execute(query, (doc_id,))
        doc = cursor.fetchone()[0]
    if self._strip_html:
        doc = text_processing.strip_html(doc)
    return doc
def import_ics(self, path):
    globals.debug("importing ics file from " + path + " ...")
    with open(path, 'rb') as ics_file:
        ical = icalendar.Calendar.from_ical(ics_file.read())
    for ics_event in ical.walk():
        if ics_event.name != "VEVENT":
            continue
        title = ics_event.get('summary')
        start = ics_event.decoded('dtstart')
        end = ics_event.decoded('dtend')
        place = ics_event.get('location')
        globals.debug("importing event " + title)
        # print(ics_event.decoded('dtstamp'))
        # build the event up front so both branches below can use it
        new_event = Event(title,
                          Event.EventType.SPECIFIC,
                          start=globals.to_minutes(start.hour, start.minute),
                          end=globals.to_minutes(end.hour, end.minute),
                          place=place)
        rrule = ics_event.get('rrule')
        if rrule:
            rrule_last = rrule['UNTIL'][0]
            curr_day = start
            while curr_day <= rrule_last:
                # try/except because of colliding events -> should at least
                # raise an error message
                globals.debug("importing recurring event " + title + " on " +
                              str(curr_day))
                try:
                    self.add_event(
                        new_event,
                        [curr_day.day, curr_day.month, curr_day.year])
                except Exception:
                    pass
                finally:
                    if 'WEEKLY' in rrule['FREQ'][0]:
                        curr_day = curr_day + dt.timedelta(days=7)
                    else:
                        break
        else:
            # try/except because of colliding events -> should at least
            # raise an error message
            try:
                self.add_event(new_event, [start.day, start.month, start.year])
            except Exception:
                continue
    globals.debug("finished import!")
def print_tfidf_topic_words(corpus_tfidf, corpus_topics, word_list, n_topics,
                            n_words=10):
    topic_tfidf_weights = get_tfidf_topic_weights(corpus_tfidf, corpus_topics,
                                                  n_topics)
    topic_top_words_i = np.argsort(topic_tfidf_weights, axis=1)[:, ::-1]
    top_words = []
    for topic_i in range(n_topics):
        topic_words = word_list[topic_top_words_i[topic_i, :n_words]]
        top_words.append(topic_words)
        g.debug(f"TF-IDF words for topic {topic_i}:", 2)
        g.debug(str(topic_words), 2)
    return top_words
def create_new_vectorizer(self):
    self.query_n_docs()
    g.debug("Downloading and vectorizing documents...")
    vectorizer = vectorizers.TfidfVectorizerProgressBar(
        max_features=g.MAX_FEATURES,
        min_df=g.MIN_DF,
        max_df=g.MAX_DF,
        stop_words=vectorizers.get_stopwords(),
        tokenizer=vectorizers.tokenize,
        ngram_range=(1, g.N_GRAMS),
        strip_accents="ascii",
        sublinear_tf=True,
        dtype=np.uint16,
        progress_bar_clear_when_done=True)
    self._tfidf_corpus = vectorizer.fit_transform(self.query(),
                                                  n_docs=self._n_docs)
    g.debug(f" -> Found {len(vectorizer.get_feature_names())} features")
    return vectorizer
def dump_topic_corpus(corpus_topics, corpus, doc_ids):
    """
    Saves the corpus to output/<topic #>/<doc id>.txt, organized by topic.

    :param corpus_topics: The top topic index for each document.
    :param corpus: The corpus to be saved.
    :param doc_ids: The document ids, used as filenames.
    :return: None
    """
    g.debug("Saving summaries to disk based on topic...")
    for i in range(corpus_topics.size):
        path = os.path.join("output", str(corpus_topics[i]).rjust(2, "0"))
        if not os.path.isdir(path):
            os.mkdir(path)
        filename = os.path.join(path, f"{doc_ids[i]}.txt")
        with open(filename, "w") as f:
            f.write(corpus[i])
    g.debug(f" -> {len(corpus)} files created!", 1)
def summarize_corpus(corpus, vectorizer, n_sentences=10):
    """
    Summarizes an entire corpus. Displays a progress bar.

    :param corpus: The corpus to be summarized.
    :param vectorizer: The TF-IDF vectorizer to be used for feature extraction.
    :param n_sentences: Number of sentences to include in each summary.
    :return: A corpus of summaries
    """
    g.debug("Summarizing documents...")
    summaries = []
    n_docs = len(corpus)
    completed = 0
    for doc in corpus:
        summaries.append(summarize_doc(doc, vectorizer, n_sentences))
        completed += 1
        g.progress_bar(completed, n_docs, 1)
    g.debug(f" -> {len(summaries)} documents summarized!", 1)
    return summaries
def cv_to_tfidf(cv_corpus, table_name, model_from_pickle):
    tfidf_corpus = None
    if model_from_pickle and os.path.exists(
            f"../data/{table_name}/pickles/TfidfTransformer.pkl"):
        tfidf_transformer = pickle_load(
            f"../data/{table_name}/pickles/TfidfTransformer.pkl")
    else:
        g.debug("Transforming to TF-IDF vector...")
        tfidf_transformer = TfidfTransformer(sublinear_tf=True)
        tfidf_corpus = tfidf_transformer.fit_transform(cv_corpus)
        g.debug(" -> Done!", 1)
    if tfidf_corpus is None:
        # transformer came from the pickle; still need to transform the corpus
        g.debug("Transforming corpus to TF-IDF...")
        tfidf_corpus = tfidf_transformer.transform(cv_corpus)
        g.debug(" -> Done!", 1)
    g.debug(f" -> {tfidf_corpus.shape[0]} count vectors with "
            f"{tfidf_corpus.shape[1]} features transformed!", 1)
    return tfidf_transformer, tfidf_corpus
def write(self, buffer):
    n = len(buffer)
    g.debug(f" -> Writing {n} total bytes...", 3)
    idx = 0
    while idx < n:
        # stay under the 2 GiB-per-call limit; note the parentheses, since
        # "1 << 31 - 1" would parse as 1 << 30
        batch_size = min(n - idx, (1 << 31) - 1)
        g.debug(f" ---> Writing bytes [{idx}, {idx + batch_size})... ", 3)
        self.f.write(buffer[idx:idx + batch_size])
        g.debug(f" ---> Done", 3)
        idx += batch_size
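# A minimal sketch of why this wrapper exists: some platforms fail on single
# write() calls of 2 GiB or more, so BigFile chunks them before they reach
# the underlying file. The write method above belongs to this class; the
# constructor and the delegation below are assumptions about the rest of the
# class, shown only for context.
class BigFile:
    def __init__(self, f):
        self.f = f  # the underlying binary file object

    def __getattr__(self, name):
        # delegate everything else (read, readline, seek, ...) to the
        # wrapped file so pickle can use this object as a file
        return getattr(self.f, name)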
from planner import plannerhandler as ph
import settings
import globals

settings.init()
from settings import tel_man, rasa_model

survey_link = "https://forms.gle/1doFP9G4LbG83cuH9"
remove_event_for_user_on_day = {}
chat_save_it = 0

while True:
    # try:
    globals.debug('Fetching new messages...')
    if not tel_man.fetch_new_messages():
        globals.debug('Fetching failed.')
        continue
    globals.debug('Fetching done!')
    globals.debug('Checking new messages...')
    for userid in tel_man.get_users():
        globals.debug('Checking new messages for user ' + str(userid))
        msgs = tel_man.get_new_messages(userid)
        files = tel_man.get_new_files(userid)
        callback_queries = tel_man.get_new_callback_queries(userid)
        if msgs:
            for msg in msgs:
def exit_handler():
    globals.debug("exiting server...")
    globals.debug("saving telegram chatlogs...")
    tel_man.store_chatlog(TG_CHATLOG_PATH)
    globals.debug("saving done!")
    globals.debug("saving telegram manager...")
    tel_man.store(TG_STORAGE_PATH)
    globals.debug("saving done!")
    globals.debug("exiting done!")
def pickle_load(file_path):
    g.debug(f"Loading {file_path} from cache...")
    with open(file_path, "rb") as f:
        obj = pickle.load(BigFile(f))
    g.debug(" -> Done", 1)
    return obj
def pickle_dump(obj, file_path):
    g.debug(f"Caching {file_path}...")
    with open(file_path, "wb") as f:
        pickle.dump(obj, BigFile(f), protocol=pickle.HIGHEST_PROTOCOL)
    g.debug(" -> Done", 1)
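# A minimal round-trip sketch; the path and object below are illustrative.
pickle_dump({"topics": 20}, "../data/example/pickles/params.pkl")
params = pickle_load("../data/example/pickles/params.pkl")
assert params["topics"] == 20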
def count_vectorize(corpus, table_name, model_from_pickle,
                    input_type="content"):
    cv_corpus = None
    if model_from_pickle and os.path.exists(
            f"../data/{table_name}/pickles/CountVectorizer.pkl"):
        count_vectorizer = pickle_load(
            f"../data/{table_name}/pickles/CountVectorizer.pkl")
    else:
        g.debug("Vectorizing documents...")
        count_vectorizer = CountVectorizerProgressBar(
            input=input_type,
            max_features=g.MAX_FEATURES,
            min_df=g.MIN_DF,
            max_df=g.MAX_DF,
            stop_words=get_stopwords(),
            tokenizer=tokenize,
            ngram_range=(1, g.N_GRAMS),
            strip_accents="ascii",
            dtype=np.uint16,
            progress_bar_clear_when_done=True)
        cv_corpus = count_vectorizer.fit_transform(corpus)
        # we can delete this to take up less memory (useful for pickling)
        count_vectorizer.stop_words_ = None
        g.debug(" -> Done!", 1)
    g.debug(f" -> Loaded vectorizer with "
            f"{len(count_vectorizer.get_feature_names())} features!", 1)
    if cv_corpus is None:
        g.debug("Transforming corpus...")
        cv_corpus = count_vectorizer.transform(corpus)
        g.debug(" -> Done!", 1)
    g.debug(f" -> Loaded {cv_corpus.shape[0]} documents with "
            f"{cv_corpus.shape[1]} features!", 1)
    return count_vectorizer, cv_corpus
def __insert_events_to_day(self, querent, events, day, first_event,
                           insert_before=True):
    last_event = first_event
    globals.debug("evolving from event " + first_event.get_name())
    if insert_before:
        globals.debug("inserting next events before")
    else:
        globals.debug("inserting next events after")
    while events:
        if day.num_events() >= self.__max_events:
            break
        event_places = [e.get_place() for e in events]
        last_place = last_event.get_place()
        globals.debug("starting api query...")
        if insert_before:
            # round the arrival down to the previous 15-minute boundary
            arr_time = last_event.get_start() - (last_event.get_start() % 15)
            results = QuerentParser(
                querent.get_travel_details(
                    origins=event_places,
                    destinations=last_place,
                    arrival_time=dt.datetime(
                        day.get_year(), day.get_month(), day.get_day(),
                        globals.to_hours(arr_time)[0],
                        globals.to_hours(arr_time)[1])))
        else:
            # round the departure up to the next 15-minute boundary
            # (from the event's end, not its start)
            dep_time = last_event.get_end() + (15 -
                                               (last_event.get_end() % 15))
            results = QuerentParser(
                querent.get_travel_details(
                    origins=last_place,
                    destinations=event_places,
                    departure_time=dt.datetime(
                        day.get_year(), day.get_month(), day.get_day(),
                        globals.to_hours(dep_time)[0],
                        globals.to_hours(dep_time)[1])))
        globals.debug("received api response!")
        durations = results.get_durations()
        # try candidate events from the nearest to the farthest
        for i in range(len(durations)):
            globals.debug("durations: " + str(durations))
            closest_event = durations.index(min(durations))
            travel_time = results.get_durations()[closest_event]
            if insert_before:
                end_time = arr_time - travel_time
                start_time = end_time - events[closest_event].get_duration()
                travel_start = end_time
                events[closest_event].set_place(
                    results.get_origins()[closest_event])
            else:
                start_time = dep_time + travel_time
                end_time = start_time + events[closest_event].get_duration()
                travel_start = dep_time
                events[closest_event].set_place(
                    results.get_destinations()[closest_event])
            if (end_time >= self.__time_end_day
                    or start_time < self.__time_begin_day):
                globals.debug("reached daily time limits!")
                durations[closest_event] = float("inf")
                if min(durations) == float("inf"):
                    globals.debug(
                        "no event fits between other event and time limits!")
                    return events
                continue
            try:
                globals.debug("trying to insert event " +
                              events[closest_event].get_name())
                day.add_event(events[closest_event], start_time, end_time)
                globals.debug("inserted event!")
                if travel_time > 0:
                    day.add_event(
                        Event("[... travelling (" + str(travel_time) +
                              "min) ...]",
                              Event.EventType.TRAVELLING,
                              start=travel_start,
                              end=travel_start + travel_time,
                              place=""), travel_start,
                        travel_start + travel_time)
                globals.debug("removing inserted event")
                last_event = events[closest_event]
                del events[closest_event]
                break
            except Exception:
                # collision exception while adding the new event
                globals.debug(
                    "two events collided! trying to insert next best event")
                durations[closest_event] = float("inf")
                if min(durations) == float("inf"):
                    globals.debug("no event fits between other events!")
                    return events
                continue
    return events
def replan(self, querent):
    globals.debug("Replanning all events...")
    # remove all previously planned (unspecific) events
    for d in self.__days:
        d.remove_unspecific()
    # mark all events as unplanned
    self.__unplanned_events = [
        e for e in self.__events if not e.is_specific()
    ]
    globals.debug("removed all previously planned events")
    for curr_day in self.__get_next_day():
        globals.debug("planning events on day " + str(curr_day.get_day()) +
                      "/" + str(curr_day.get_month()) + "/" +
                      str(curr_day.get_year()))
        if curr_day.num_specific_events() > 0:
            globals.debug("day has at least one specific event")
            first_it = True
            for sp_event in curr_day.get_next_specific_event():
                self.__unplanned_events = self.__insert_events_to_day(
                    querent, self.__unplanned_events, curr_day, sp_event,
                    first_it)
                if first_it:
                    first_it = False
        else:
            # no specific events for the current day
            globals.debug("day has no specific events")
            self.__unplanned_events = self.__insert_events_to_day(
                querent,
                self.__unplanned_events,
                curr_day,
                Event("Home",
                      Event.EventType.SPECIFIC,
                      self.__time_begin_day - 10,
                      self.__time_begin_day,
                      place=self.__home),
                insert_before=False)
        if not self.__unplanned_events:
            break
    globals.debug("finished planning for all events")
    # unplanned_events_str = ["- " + e.get_name()
    #                         for e in self.__unplanned_events]
    # if not unplanned_events_str:
    #     print("All events could be planned!")
    # else:
    #     print("Still unplanned events:")
    #     for s in unplanned_events_str:
    #         print("    " + s)
    # print("\n")
    globals.debug("Replanning Done!")