def generate():
    shutil.rmtree(config['default']['site_path'])
    posts = reader.read_files(config['default']['posts_path'], 'post')
    pages = reader.read_files(config['default']['pages_path'], 'page')
    generate_posts(posts, pages)
    generate_pages(pages, posts)
    generate_index(pages, posts)
    generate_categories(pages, posts)
    generate_feed(posts)
    utils.printnum(len(posts), 'post')
    utils.printnum(len(pages), 'page')
# Hyperparameters
max_features = 2048
maxlen = 128
batch_size = 128
split_percentage = 80
epoch = 7
embed_dim = 128
lstm_out_space = 128
filters = 64
kernel_size = 3

# Parsing
logger.debug("[Opening corpus... ]")
corpus = read_files(['positive', 'negative', 'regular'])
corpus = clean_corpus(corpus)
corpus = shuffle(corpus)

newCorpus = {'text': [], 'raw': [], 'sentiment': []}

logger.debug("[Mapping corpus... ]")
logger.debug("[The corpus has " + str(len(corpus)) + " rows]")
for (processed_sentence, raw_sentence, sentiment) in corpus:
    newCorpus["text"].append(" ".join(processed_sentence))
    newCorpus["raw"].append(raw_sentence)
    newCorpus["sentiment"].append(sentiment_to_vector(sentiment))

logger.debug("[Vectorizing corpus... ]")
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(newCorpus['text'])
vectorized_text = tokenizer.texts_to_sequences(newCorpus['text'])
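# Illustrative sketch, not part of the original excerpt: one way the vectorized corpus above
# could be padded and fed into a Conv1D + LSTM classifier using the hyperparameters already
# defined (maxlen, embed_dim, filters, kernel_size, lstm_out_space, max_features, batch_size,
# epoch, split_percentage). The real model definition is not shown in this excerpt, so the
# layer choices below are assumptions; it also assumes sentiment_to_vector returns one-hot
# vectors over the three labels (positive / negative / regular).
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense

X = pad_sequences(vectorized_text, maxlen=maxlen)  # shape: (num_samples, maxlen)
y = np.array(newCorpus['sentiment'])               # one-hot sentiment vectors

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=maxlen))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(lstm_out_space))
model.add(Dense(3, activation='softmax'))          # positive / negative / regular
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train/validation split driven by split_percentage
split = int(len(X) * split_percentage / 100)
x_train, x_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]
model.fit(x_train, y_train, batch_size=batch_size, epochs=epoch,
          validation_data=(x_test, y_test))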
def check(in_filename, sen_repr_path, db_path, word_repr, dictionary):
    # read the data set
    db = r.read_files_float(db_path)

    # read dictionary
    d = r.read_dictionary(dictionary, word_repr)

    # read the original sentences indices and filter them
    orig_sent = r.read_files(in_filename)
    f1_sent = f.remove_long_short_sentences(orig_sent)
    gc.collect()
    f2_sent = f.remove_unknown(f1_sent)
    gc.collect()

    # read the representations
    sen_repr = list()
    fid = open(sen_repr_path)
    lines = fid.readlines()
    fid.close()
    for i in range(len(lines)):
        sen_repr.append([i, lines[i]])

    word_test_flag = True
    sentence_test_flag = True
    log_word = ""
    log_sen = ""

    # testing
    for i in range(len(f2_sent)):
        # target_word = 0  # first word test
        target_word = len(f2_sent[i][1]) - 1  # last word
        sen_from_db = c.vector2string(db[i * 2][1][1:1001])
        w_from_db = c.vector2string(db[i * 2][1][1001:2001])
        w_target = c.vector2string(d[f2_sent[i][1][target_word] - 1][1])
        sen_target = c.vector2string([float(x) for x in sen_repr[f2_sent[i][0]][1].split()])

        if w_from_db != w_target:
            log_word += "From DB: " + w_from_db + "\n"
            log_word += "Target: " + w_target + "\n\n"
            word_test_flag = False

        if sen_from_db != sen_target:
            log_sen += "From DB: " + sen_from_db + "\n"
            log_sen += "Target: " + sen_target + "\n\n"
            sentence_test_flag = False

    # test summary
    if sentence_test_flag and word_test_flag:
        print "Test pass!"
    elif not sentence_test_flag and word_test_flag:
        print "Word test pass, sentence test failed."
        print log_sen
    elif sentence_test_flag and not word_test_flag:
        print "Sentence test pass, word test failed."
        print log_word
    else:
        print "Both sentence and word tests failed."
        print "SENTENCE:"
        print log_sen
        print "WORD:"
        print log_word
# Tail of a helper that maps (positive, neutral, negative) scores to a label
if (positive > neutral and positive > negative):
    return 'positive'
if (neutral > positive and neutral > negative):
    return 'neutral'
if (negative > neutral and negative > positive):
    return 'negative'


def promedio(prediction):
    # promedio = "average": mean of the first component of each prediction
    total = 0
    for e in prediction:
        total += e[0]
    return total / len(prediction)


corpus = read_files(['positive_long', 'negative_long', 'regular_long'])
corpus = clean_corpus(corpus)
corpus = shuffle(corpus)

newCorpus = {}
newCorpus["text"] = []
newCorpus["raw"] = []
newCorpus["sentiment"] = []

max_features = 20000
selector = selector_factory_by_label(max_features)
best_tokens = select_best_tokens(corpus, selector)
feature_extractor = sentence_to_best_tokens(best_tokens)

logger.log("Mapping corpus")
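# Illustrative sketch only: the project's selector_factory_by_label, select_best_tokens and
# sentence_to_best_tokens are not shown in this excerpt. This is one plausible shape the names
# suggest, assuming the same (processed_sentence, raw_sentence, sentiment) corpus triples used
# above: keep the most frequent tokens per label, then map a sentence to a presence feature dict.
from collections import Counter

def selector_factory_by_label(max_features):
    def selector(corpus):
        per_label = {}
        for processed_sentence, _raw, sentiment in corpus:
            per_label.setdefault(sentiment, Counter()).update(processed_sentence)
        best = set()
        for counts in per_label.values():
            best.update(token for token, _count in counts.most_common(max_features))
        return best
    return selector

def select_best_tokens(corpus, selector):
    return selector(corpus)

def sentence_to_best_tokens(best_tokens):
    def extract(sentence_tokens):
        # Boolean presence features over the selected vocabulary
        return {token: (token in sentence_tokens) for token in best_tokens}
    return extract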
#!/usr/bin/python3
# LP-Trab4-PYTHON: Solving the point grouping problem with the leader algorithm
#
# Alan Herculano Diniz
#
# main.py: program's entry point

import sys
import reader
import leader

# Getting the command line arguments with the input filepaths:
if (len(sys.argv) == 1):
    points_file = "entrada.txt"
    dist_file = "distancia.txt"
else:
    # Avoiding null filepaths
    points_file = sys.argv[1]
    dist_file = sys.argv[2]

# Reading the input files:
points, dist = reader.read_files(points_file, dist_file)

# Calculating the algorithm results:
sse, groups = leader.calculate_results(dist, points)

# Printing the algorithm results:
reader.print_results(sse, groups)
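# Illustrative sketch, not the project's leader module (which is not shown here): the classic
# leader clustering idea that leader.calculate_results presumably builds on. Each point joins
# the first leader within the distance threshold `dist`; otherwise it starts a new group led
# by itself. SSE is the sum of squared distances from members to their group's leader.
def leader_clustering(points, dist):
    def sq_distance(a, b):
        return sum((x - y) ** 2 for x, y in zip(a, b))

    groups = []  # each group: {"leader": point, "members": [points]}
    for point in points:
        for group in groups:
            if sq_distance(point, group["leader"]) ** 0.5 <= dist:
                group["members"].append(point)
                break
        else:
            groups.append({"leader": point, "members": [point]})

    sse = sum(sq_distance(p, g["leader"]) for g in groups for p in g["members"])
    return sse, groups

# Example usage with toy 2-D points and a threshold of 2.0:
# sse, groups = leader_clustering([(0, 0), (1, 0), (5, 5)], 2.0)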
)
parser.add_argument(
    "in_filename",
    help="The path to the train/test/val file, it should be in index format not"
         " exact words"
)
parser.add_argument("out_filename", help="The output path should be dir")
parser.add_argument("file_name", help="the file name to be created for each test")
parser.add_argument(
    "--words_repr",
    help="The path to the words representation file",
    default="../data/enc_dec_100/word_rep.txt"
)
parser.add_argument("--dictionary", help="The path to the dictionary",
                    default="../data/orig/dictionary.txt")
args = parser.parse_args()

dictionary = r.read_dictionary(args.dictionary, args.words_repr)
print "Dictionary size is: ", len(dictionary)

sent = r.read_files(args.in_filename)
print "Number of original sentences is: ", len(sent)

# =========== FIRST WORD =========== #
print ("\nCreate first word db ...")
first_word_path = args.out_filename + "first_word/"
first_word_filename = first_word_path + args.file_name
if not os.path.exists(first_word_path):
    os.mkdir(args.out_filename + "first_word")
db.create_first_word_db(first_word_filename, sent)
print ("Done.")
# ================================== #

# ============ LAST WORD =========== #
print ("\nCreate last word db ...")
last_word_path = args.out_filename + "last_word/"