Example #1
File: nimbus.py Project: pscohn/nimbus
def generate():
    # Wipe the previous build before regenerating the site
    shutil.rmtree(config['default']['site_path'])
    # Load the source content for posts and pages
    posts = reader.read_files(config['default']['posts_path'], 'post')
    pages = reader.read_files(config['default']['pages_path'], 'page')
    # Render every output artifact of the site
    generate_posts(posts, pages)
    generate_pages(pages, posts)
    generate_index(pages, posts)
    generate_categories(pages, posts)
    generate_feed(posts)
    # Report how many posts and pages were processed
    utils.printnum(len(posts), 'post')
    utils.printnum(len(pages), 'page')
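One caveat worth noting: shutil.rmtree raises FileNotFoundError when the target directory does not exist, so generate() assumes a previous build is present. A more forgiving wipe is a one-line change (a sketch, not nimbus's actual code):

shutil.rmtree(config['default']['site_path'], ignore_errors=True)  # no-op if the dir is missing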
Example #2
# Hyperparameters
max_features = 2048
maxlen = 128
batch_size = 128
split_percentage = 80
epoch = 7
embed_dim = 128
lstm_out_space = 128
filters = 64
kernel_size = 3

# Parsing
logger.debug("[Opening corpus...]")

corpus = read_files(['positive', 'negative', 'regular'])
corpus = clean_corpus(corpus)
corpus = shuffle(corpus)
newCorpus = {'text': [], 'raw': [], 'sentiment': []}

logger.debug("[Mapeando corpus... ]")
logger.debug("[El corpus tiene " + str(len(corpus)) + " rows]")
for (processed_sentence, raw_sentence, sentiment) in corpus:
    newCorpus["text"].append(" ".join(processed_sentence))
    newCorpus["raw"].append(raw_sentence)
    newCorpus["sentiment"].append(sentiment_to_vector(sentiment))

logger.debug("[Vectorizando corpus... ]")
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(newCorpus['text'])
vectorized_text = tokenizer.texts_to_sequences(newCorpus['text'])
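The snippet stops right after tokenization. A plausible next step pads the sequences to maxlen and applies the declared split_percentage; this continuation is a sketch assuming the classic Keras pad_sequences API, not the original author's code:

import numpy as np
from keras.preprocessing.sequence import pad_sequences

# Pad/truncate every sequence to a fixed length for the embedding layer
X = pad_sequences(vectorized_text, maxlen=maxlen)
y = np.array(newCorpus['sentiment'])

# Hold out the last (100 - split_percentage)% of rows for validation
split_at = int(len(X) * split_percentage / 100)
X_train, X_val = X[:split_at], X[split_at:]
y_train, y_val = y[:split_at], y[split_at:]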
Example #3
def check(in_filename, sen_repr_path, db_path, word_repr, dictionary):
    # read the data set
    db = r.read_files_float(db_path)
    # read dictionary
    d = r.read_dictionary(dictionary, word_repr)

    # read the original sentences indices and filter them
    orig_sent = r.read_files(in_filename)
    f1_sent = f.remove_long_short_sentences(orig_sent)
    gc.collect()
    f2_sent = f.remove_unknown(f1_sent)
    gc.collect()

    # read the representations as (index, raw line) pairs
    sen_repr = list()
    with open(sen_repr_path) as fid:
        lines = fid.readlines()
    for i, line in enumerate(lines):
        sen_repr.append([i, line])

    word_test_flag = True
    sentence_test_flag = True
    log_word = ""
    log_sen = ""

    # testing
    for i in range(len(f2_sent)):
        # target_word = 0  # first word test
        target_word = len(f2_sent[i][1]) - 1  # last word

        # each DB record stores a 1000-dim sentence half followed by a 1000-dim word half
        sen_from_db = c.vector2string(db[i * 2][1][1:1001])
        w_from_db = c.vector2string(db[i * 2][1][1001:2001])

        w_target = c.vector2string(d[f2_sent[i][1][target_word] - 1][1])
        sen_target = c.vector2string([float(x) for x in sen_repr[f2_sent[i][0]][1].split()])

        if w_from_db != w_target:
            log_word += "From DB: " + w_from_db + "\n"
            log_word += "Target: " + w_target + "\n\n"
            word_test_flag = False

        if sen_from_db != sen_target:
            log_sen += "From DB: " + sen_from_db + "\n"
            log_sen += "Target: " + sen_target + "\n\n"
            sentence_test_flag = False

    # test summary
    if sentence_test_flag and word_test_flag:
        print "Test pass!"
    elif not sentence_test_flag and word_test_flag:
        print "Word test pass, sentence test failed."
        print log_sen
    elif sentence_test_flag and not word_test_flag:
        print "Sentence test pass, word test failed."
        print log_word
    else:
        print "Both sentence and word tests failed."
        print "SENTENCE:"
        print log_sen
        print "WORD:"
        print log_word
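c.vector2string is only used here to canonicalize vectors so they can be compared as strings; a minimal sketch of what such a helper might look like (an assumption, not necessarily the project's implementation):

def vector2string(vec):
    # Serialize a float vector as one space-separated string so two
    # vectors can be checked for exact equality via string comparison
    return " ".join(str(float(x)) for x in vec)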
Example #4
    if positive > neutral and positive > negative:
        return 'positive'
    if neutral > positive and neutral > negative:
        return 'neutral'
    if negative > neutral and negative > positive:
        return 'negative'
    # ties fall through and implicitly return None


def promedio(prediction):
    # "promedio" is Spanish for "average": mean of the first component of each prediction row
    total = 0
    for e in prediction:
        total += e[0]
    return total / len(prediction)
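For illustration, promedio over a hypothetical list of single-score prediction rows:

print(promedio([[0.2], [0.4], [0.9]]))  # (0.2 + 0.4 + 0.9) / 3 = 0.5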


corpus = read_files(['positive_long', 'negative_long', 'regular_long'])
corpus = clean_corpus(corpus)
corpus = shuffle(corpus)
newCorpus = {'text': [], 'raw': [], 'sentiment': []}

max_features = 20000

selector = selector_factory_by_label(max_features)
best_tokens = select_best_tokens(corpus, selector)

feature_extractor = sentence_to_best_tokens(best_tokens)

logger.log("Mapeando corpus")
Example #5
#!/usr/bin/python3

# LP-Trab4-PYTHON: Solving the point grouping problem with the leader algorithm
#
# Alan Herculano Diniz
#
# main.py: program's entry point

import sys
import reader
import leader

# Getting the command line arguments with the input filepaths:
if len(sys.argv) < 3:  # fall back to the default paths unless both are given
    points_file = "entrada.txt"
    dist_file = "distancia.txt"
else:
    points_file = sys.argv[1]
    dist_file = sys.argv[2]

# Reading the input files:
points, dist = reader.read_files(points_file, dist_file)

# Calculating the algorithm results:
sse, groups = leader.calculate_results(dist, points)

# Printing the algorithm results:
reader.print_results(sse, groups)
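For context, the leader algorithm assigns each point to the first existing group whose leader lies within a distance threshold; otherwise the point founds a new group. A minimal sketch with Euclidean distance (leader.calculate_results in the project may differ, e.g. in how sse is computed):

import math

def leader_clustering(points, threshold):
    groups = []  # list of (leader_point, members) pairs
    for p in points:
        for leader_point, members in groups:
            if math.dist(p, leader_point) <= threshold:
                members.append(p)  # join the first close-enough leader
                break
        else:
            groups.append((p, [p]))  # no leader close enough: found a new group
    return groups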
Example #6
    parser.add_argument(
        "in_filename", help="The path to the train/test/val file; it should be in index format, not exact words"
    )
    parser.add_argument("out_filename", help="The output path should be dir")
    parser.add_argument("file_name", help="the file name to be created for each test")
    parser.add_argument(
        "--words_repr", help="The path to the words representation file", default="../data/enc_dec_100/word_rep.txt"
    )
    parser.add_argument("--dictionary", help="The path to the dictionary", default="../data/orig/dictionary.txt")
    args = parser.parse_args()

    dictionary = r.read_dictionary(args.dictionary, args.words_repr)
    print "Dictionary size is: ", len(dictionary)

    sent = r.read_files(args.in_filename)
    print "Number of original sentences is: ", len(sent)

    # =========== FIRST WORD =========== #
    print("\nCreate first word db ...")
    first_word_path = args.out_filename + "first_word/"
    first_word_filename = first_word_path + args.file_name
    if not os.path.exists(first_word_path):
        os.mkdir(first_word_path)
    db.create_first_word_db(first_word_filename, sent)
    print("Done.")
    # ================================== #

    # ============ LAST WORD =========== #
    print("\nCreate last word db ...")
    last_word_path = args.out_filename + "last_word/"
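The example is cut off here; mirroring the FIRST WORD block above, the remainder presumably looks like the following (create_last_word_db is an assumed name inferred from the parallel structure):

    last_word_filename = last_word_path + args.file_name
    if not os.path.exists(last_word_path):
        os.mkdir(last_word_path)
    db.create_last_word_db(last_word_filename, sent)  # assumed counterpart of create_first_word_db
    print("Done.")
    # ================================== #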