Example #1
import os

from gensim import corpora

import JsonParser


def run_hicocluster_create_matrix():
    # Number of docs: 1950
    # Number of items: 21826
    texts = JsonParser.get_texts(os.path.join(os.getcwd(), "clusters"))
    new_texts = [text.split() for text in texts]

    # Build the token -> id mapping and persist it for later runs.
    dictionary = corpora.Dictionary(new_texts)
    dictionary.save(os.path.join(os.getcwd(), "dictionary.dict"))

    # Convert each document to a sparse bag-of-words vector and serialize.
    corpus = [dictionary.doc2bow(text) for text in new_texts]
    corpora.MmCorpus.serialize(os.path.join(os.getcwd(), "corpus.mm"), corpus)
    print("number of docs: " + str(dictionary.num_docs))
    print("number of items: " + str(len(dictionary.token2id)))

    # Expand each non-empty sparse vector into a dense term-frequency row.
    features = len(dictionary.token2id)
    set_doc_terms = []
    for doc in corpus:
        if len(doc) > 0:
            doc_terms = [0] * features
            for term_id, term_count in doc:
                doc_terms[term_id] = term_count
            set_doc_terms.append(doc_terms)

    # Write the dense document-term matrix, one document per line.
    with open(os.path.join(os.getcwd(), "matrix.txt"), "w") as matrix:
        for line in set_doc_terms:
            matrix.write(" ".join(str(value) for value in line) + "\n")
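JsonParser is project-specific code that is not shown on this page. As a rough sketch of what a helper with the same shape might look like, the following reads every JSON file in the given directory and returns one text per file; the file layout and the "text" key are assumptions for illustration, not the project's actual format.

import json
import os


def get_texts(directory):
    # Hypothetical stand-in for JsonParser.get_texts: collect the assumed
    # "text" field from every *.json file in the directory.
    texts = []
    for name in sorted(os.listdir(directory)):
        if name.endswith(".json"):
            with open(os.path.join(directory, name), "r", encoding="utf-8") as handle:
                texts.append(json.load(handle)["text"])
    return texts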
Example #2
import os

import numpy

import ExTFIDF
import JsonParser


def get_combination():
    print("run_combination")
    # Google data (alternative source, unused here):
    # parser = GoogleNewsParser.NewsParsers()
    # parser.parse_data_from_tok()
    # tfidf.fit_data(parser.get_texts())

    # Json Google: fit TF-IDF on the texts parsed from the clusters directory.
    tfidf = ExTFIDF.TfIdf()
    tfidf.fit_data(JsonParser.get_texts(os.path.join(os.getcwd(), "clusters")))
    tf_vectors = tfidf.get_data_as_vector()
    print("length of tfidf feature: " + str(len(tf_vectors[0])))

    # load_d2v() returns a pair whose second element holds one doc2vec
    # vector per document.
    pairs = load_d2v()
    single = pairs[1]
    print("length of doc2vec feature: " + str(len(single[0])))

    # Concatenate the two feature sets column-wise: each document's final
    # vector is [tfidf features | doc2vec features].
    final = numpy.hstack((tf_vectors, single))
    print("length of final features: " + str(len(final[0])))
    return final
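load_d2v() and ExTFIDF are defined elsewhere in this project, but the combination step itself is plain numpy. A minimal self-contained illustration of what numpy.hstack does here, using toy two-document arrays in place of the real TF-IDF and doc2vec vectors:

import numpy

# Toy values: 2 documents with 3 TF-IDF features and 2 doc2vec features.
tf_vectors = numpy.array([[0.1, 0.0, 0.5],
                          [0.0, 0.3, 0.2]])
d2v_vectors = numpy.array([[0.9, -0.4],
                           [0.7, 0.1]])

final = numpy.hstack((tf_vectors, d2v_vectors))
print(final.shape)    # (2, 5): each row is [tfidf | doc2vec]
print(len(final[0]))  # 5, matching the "length of final features" print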
Example #3
import os

import Algorithm
import ExTFIDF
import JsonParser


def algorithm_tfidf():
    print("Running TFIDF")
    # Google data (alternative source, unused here):
    # parser = GoogleNewsParser.NewsParsers()
    # parser.parse_data_from_tok()
    # tfidf.fit_data(parser.get_texts())

    # Json Google: fit TF-IDF on the texts parsed from the clusters directory.
    tfidf = ExTFIDF.TfIdf()
    tfidf.fit_data(JsonParser.get_texts(os.path.join(os.getcwd(), "clusters")))

    tf_vectors = tfidf.get_data_as_vector()
    print("length of tfidf: " + str(len(tf_vectors)))

    print("Running algorithm with TFIDF")
    Algorithm.algorithm_Kmean(tf_vectors)
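Algorithm.algorithm_Kmean is also project code that is not shown. A minimal sketch of what such a function could do with scikit-learn's KMeans; the function body, the default cluster count, and the use of scikit-learn are all assumptions for illustration:

from sklearn.cluster import KMeans


def algorithm_Kmean(vectors, n_clusters=20):
    # Hypothetical stand-in: cluster the document vectors with k-means
    # and report each document's assigned cluster.
    model = KMeans(n_clusters=n_clusters, random_state=0)
    labels = model.fit_predict(vectors)
    for doc_index, label in enumerate(labels):
        print("document %d -> cluster %d" % (doc_index, label))
    return labels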