def preprocess(data):
    """Build and clean the corpus, then persist stopwords and parameters.

    Parameters:
        data: Mapping with at least the keys "topics" and "iterations".

    Returns:
        Tuple of (cleaned document-term matrix, per-document token counts
        as a list, parameters dict).
    """
    # Build the corpus from the stored textfiles.
    textfiles = database.select("textfiles")
    documents = utils.get_documents(textfiles)
    corpus = cophi.model.Corpus(documents)
    token_counts = corpus.num_tokens
    database.update("textfiles", token_counts.to_dict())

    # Corpus dimensions: number of documents x vocabulary size.
    n_documents, n_types = corpus.dtm.shape
    total_tokens = token_counts.sum()

    # Drop stopwords and hapax legomena from the document-term matrix.
    stopwords = utils.get_stopwords(data, corpus)
    hapax = corpus.hapax
    discarded = set(stopwords) | set(hapax)
    logging.info("Cleaning corpus...")
    dtm = corpus.drop(corpus.dtm, discarded)

    # Persist the stopword list for later export.
    database.insert_into("stopwords", json.dumps(stopwords))

    parameters = {
        "n_topics": int(data["topics"]),
        "n_iterations": int(data["iterations"]),
        "n_documents": int(n_documents),
        "n_stopwords": int(len(stopwords)),
        "n_hapax": int(len(hapax)),
        "n_tokens": int(total_tokens),
        "n_types": int(n_types)
    }
    return dtm, token_counts.tolist(), parameters
示例#2
0
def _clean_label(label):
    """Strip commas and " ..." truncation markers from a serialized label."""
    return label.replace(",", "").replace(" ...", "")


def export_data():
    """Export model output to ZIP archive.

    Reads the stored model output, writes one CSV per matrix plus a
    stopwords text file into DATA_EXPORT, and zips the directory.
    """
    logging.info("Creating data archive...")
    # Start from an empty export directory.
    if DATA_EXPORT.exists():
        unlink_content(DATA_EXPORT)
    else:
        DATA_EXPORT.mkdir()
    model, stopwords = database.select("data_export")
    document_topic, topics, document_similarities, topic_similarities = model

    logging.info("Preparing document-topic distributions...")
    document_topic = pd.read_json(document_topic, orient="index")
    document_topic.columns = [
        _clean_label(col) for col in document_topic.columns
    ]

    logging.info("Preparing topics...")
    topics = pd.read_json(topics, orient="index")
    topics.index = ["Topic {}".format(n) for n in range(topics.shape[0])]
    topics.columns = ["Word {}".format(n) for n in range(topics.shape[1])]

    logging.info("Preparing topic similarity matrix...")
    topic_similarities = pd.read_json(topic_similarities)
    topic_similarities.columns = [
        _clean_label(col) for col in topic_similarities.columns
    ]
    topic_similarities.index = [
        _clean_label(ix) for ix in topic_similarities.index
    ]

    logging.info("Preparing document similarity matrix...")
    document_similarities = pd.read_json(document_similarities)
    data_export = {
        "document-topic-distribution": document_topic,
        "topics": topics,
        "topic-similarities": topic_similarities,
        "document-similarities": document_similarities,
        "stopwords": json.loads(stopwords)
    }

    for name, data in data_export.items():
        if name == "stopwords":
            # Stopwords are a plain word list: one word per line.
            path = Path(DATA_EXPORT, "{}.txt".format(name))
            with path.open("w", encoding="utf-8") as file:
                file.writelines("{}\n".format(word) for word in data)
        else:
            path = Path(DATA_EXPORT, "{}.csv".format(name))
            data.to_csv(path, sep=";", encoding="utf-8")
    shutil.make_archive(DATA_EXPORT, "zip", DATA_EXPORT)
示例#3
0
def get_textfile_sizes():
    """Return the stored textfile sizes."""
    sizes = database.select("textfile_sizes")
    return sizes
示例#4
0
def get_parameters():
    """Return the stored model parameters as a JSON string."""
    parameters = database.select("parameters")
    return json.dumps(parameters)
示例#5
0
def get_token_frequencies():
    """Return the stored token frequencies per document."""
    frequencies = database.select("token_freqs")
    return frequencies
示例#6
0
def get_stopwords():
    """Return the stored stopword list."""
    stopwords = database.select("stopwords")
    return stopwords
示例#7
0
def get_textfile(title):
    """Return the stored textfile matching *title*."""
    textfile = database.select("textfile", title=title)
    return textfile
示例#8
0
def get_topic_similarities():
    """Return the stored topic similarity matrix."""
    similarities = database.select("topic_similarities")
    return similarities
示例#9
0
def get_document_similarities():
    """Return the stored document similarity matrix."""
    similarities = database.select("document_similarities")
    return similarities
示例#10
0
def get_topics():
    """Return the stored topics."""
    topics = database.select("topics")
    return topics
示例#11
0
def get_document_topic_distributions():
    """Return the stored document-topic distributions."""
    distributions = database.select("document_topic_distributions")
    return distributions