Exemplo n.º 1
0
    # Corpus-wide accumulators, all starting empty.
    # NOTE(review): the loop below appends to `corpus_words`, `corpus_bi_grams`
    # and `corpus_tri_grams`, not to the names created here. Those targets, as
    # well as `doc_count` and the `*_count_list` lists, are not defined in this
    # excerpt -- confirm whether these five lists are used at all or whether
    # the names are simply out of sync.
    corpus = []
    corpus_bigrams = []
    corpus_trigrams = []
    corpus_fourgrams = []
    corpus_fivegrams = []
    # Process each entry produced by recurse_dir for './corpus' and '*.txt'
    # (presumably the paths of matching text files -- verify against
    # recurse_dir's definition).
    for doc in recurse_dir(r'./corpus', '*.txt'):
        # Open in binary mode and read the whole file; `doc` is rebound from
        # the loop item to the raw file contents.
        # NOTE(review): no close() for `doc_file` is visible in this excerpt;
        # if none follows, a `with open(...)` block would avoid leaking the
        # handle.
        doc_file = open(doc, 'rb')
        doc = doc_file.read()
        # Fresh per-document lists handed to Corpus below. Their lengths are
        # read afterwards, so Corpus presumably fills them in place -- confirm
        # in the Corpus class.
        doc_words = []
        bi_grams = []
        tri_grams = []
        four_grams = []
        five_grams = []

        # Build the Corpus helper for this document, then pass the result of
        # parse_xml() together with the list [0] to generate_location_vector.
        # The meaning of the [0] argument is not visible here.
        cor = Corpus(doc, doc_words, bi_grams, tri_grams, four_grams, five_grams)
        cor.generate_location_vector(cor.parse_xml(), [0])
        doc_count += 1

        # Record this document's word count and keep its word list.
        doc_word_count = len(doc_words)
        doc_word_count_list.append(doc_word_count)
        corpus_words.append(doc_words)

        # Same bookkeeping for bigrams.
        doc_bi_gram_count = len(bi_grams)
        doc_bi_gram_count_list.append(doc_bi_gram_count)
        corpus_bi_grams.append(bi_grams)

        # Same bookkeeping for trigrams.
        doc_tri_gram_count = len(tri_grams)
        doc_tri_gram_count_list.append(doc_tri_gram_count)
        corpus_tri_grams.append(tri_grams)

        # Four-gram count for this document (the excerpt ends here).
        doc_four_gram_count = len(four_grams)