import json
import logging
import shutil
from pathlib import Path

import cophi
import pandas as pd

# Imports inferred from usage below; `database` and `utils` are local modules
# of this package, and DATA_EXPORT (a Path) and unlink_content() are defined
# alongside them.
import database
import utils


def preprocess(data):
    """Preprocess text data."""
    # Construct corpus:
    textfiles = database.select("textfiles")
    documents = utils.get_documents(textfiles)
    corpus = cophi.model.Corpus(documents)
    num_tokens = corpus.num_tokens
    database.update("textfiles", num_tokens.to_dict())

    # Get parameters:
    D, W = corpus.dtm.shape
    N = num_tokens.sum()

    # Clean corpus:
    stopwords = utils.get_stopwords(data, corpus)
    hapax = corpus.hapax
    features = set(stopwords).union(set(hapax))
    logging.info("Cleaning corpus...")
    dtm = corpus.drop(corpus.dtm, features)

    # Save stopwords:
    database.insert_into("stopwords", json.dumps(stopwords))

    # Save parameters:
    parameters = {"n_topics": int(data["topics"]),
                  "n_iterations": int(data["iterations"]),
                  "n_documents": int(D),
                  "n_stopwords": int(len(stopwords)),
                  "n_hapax": int(len(hapax)),
                  "n_tokens": int(N),
                  "n_types": int(W)}
    return dtm, num_tokens.tolist(), parameters

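# Hedged usage sketch (not part of the original module): one way the values
# returned by preprocess() could feed a topic model. The third-party `lda`
# package and the _example_topic_model name are assumptions for illustration;
# only preprocess() above is taken from this module.
def _example_topic_model(data):
    import lda  # assumption: https://pypi.org/project/lda/
    dtm, sizes, parameters = preprocess(data)
    model = lda.LDA(n_topics=parameters["n_topics"],
                    n_iter=parameters["n_iterations"])
    # The lda package expects an integer document-term matrix with one row
    # per document; dtm is assumed to be a pandas DataFrame of raw counts.
    model.fit(dtm.astype(int).values)
    return model
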
def export_data():
    """Export model output to a ZIP archive."""
    logging.info("Creating data archive...")
    if DATA_EXPORT.exists():
        unlink_content(DATA_EXPORT)
    else:
        DATA_EXPORT.mkdir()

    model, stopwords = database.select("data_export")
    document_topic, topics, document_similarities, topic_similarities = model

    logging.info("Preparing document-topic distributions...")
    document_topic = pd.read_json(document_topic, orient="index")
    document_topic.columns = [col.replace(",", "").replace(" ...", "")
                              for col in document_topic.columns]

    logging.info("Preparing topics...")
    topics = pd.read_json(topics, orient="index")
    topics.index = ["Topic {}".format(n) for n in range(topics.shape[0])]
    topics.columns = ["Word {}".format(n) for n in range(topics.shape[1])]

    logging.info("Preparing topic similarity matrix...")
    topic_similarities = pd.read_json(topic_similarities)
    topic_similarities.columns = [col.replace(",", "").replace(" ...", "")
                                  for col in topic_similarities.columns]
    topic_similarities.index = [ix.replace(",", "").replace(" ...", "")
                                for ix in topic_similarities.index]

    logging.info("Preparing document similarity matrix...")
    document_similarities = pd.read_json(document_similarities)

    data_export = {"document-topic-distribution": document_topic,
                   "topics": topics,
                   "topic-similarities": topic_similarities,
                   "document-similarities": document_similarities,
                   "stopwords": json.loads(stopwords)}
    for name, data in data_export.items():
        if name == "stopwords":
            path = Path(DATA_EXPORT, "{}.txt".format(name))
            with path.open("w", encoding="utf-8") as file:
                for word in data:
                    file.write("{}\n".format(word))
        else:
            path = Path(DATA_EXPORT, "{}.csv".format(name))
            data.to_csv(path, sep=";", encoding="utf-8")
    shutil.make_archive(DATA_EXPORT, "zip", DATA_EXPORT)

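# unlink_content() is called in export_data() but defined elsewhere in this
# package; a minimal sketch of what it presumably does (an assumption):
# empty DATA_EXPORT so stale files never end up in a fresh archive.
def _unlink_content_sketch(directory):
    for path in directory.iterdir():
        if path.is_file():
            path.unlink()
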
def get_textfile_sizes():
    """Textfile sizes."""
    return database.select("textfile_sizes")


def get_parameters():
    """Model parameters."""
    return json.dumps(database.select("parameters"))


def get_token_frequencies():
    """Token frequencies per document."""
    return database.select("token_freqs")


def get_stopwords():
    """Stopwords."""
    return database.select("stopwords")

def get_textfile(title):
    """A single textfile, selected by title."""
    return database.select("textfile", title=title)

def get_topic_similarities():
    """Topic similarity matrix."""
    return database.select("topic_similarities")


def get_document_similarities():
    """Document similarity matrix."""
    return database.select("document_similarities")


def get_topics():
    """Topics."""
    return database.select("topics")

def get_document_topic_distributions():
    """Document-topic distributions."""
    return database.select("document_topic_distributions")

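# The accessors above read like backend helpers for a small web UI; a
# minimal wiring sketch, assuming Flask (an assumption: this module itself
# does not import Flask). get_parameters() already returns a JSON string,
# so it can be served as-is.
def _example_app():
    from flask import Flask
    app = Flask(__name__)
    app.add_url_rule("/parameters", "parameters", get_parameters)
    return app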