Example #1
def test_Parser_data(self):
    # Read the fixture file that sits next to the running script.
    with open(os.path.join(os.path.dirname(sys.argv[0]),
                           "test_parser.txt")) as fr:
        p = Parser(fr)
        # Write the parser's tokenized dataset next to the script as well.
        with open(os.path.join(os.path.dirname(sys.argv[0]),
                               "test_parser_dataset.txt"), "w") as fw:
            p.get_data(fw)
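Taken alone, the method is missing its test-class scaffolding. A minimal sketch of that context, assuming Parser comes from a hypothetical tweet_parser module (the real import path is not shown in the snippet), could be:

import os
import sys
import unittest

from tweet_parser import Parser  # hypothetical import path


class ParserTest(unittest.TestCase):
    def test_Parser_data(self):
        base = os.path.dirname(sys.argv[0])
        with open(os.path.join(base, "test_parser.txt")) as fr:
            p = Parser(fr)
            with open(os.path.join(base, "test_parser_dataset.txt"), "w") as fw:
                p.get_data(fw)


if __name__ == "__main__":
    unittest.main()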
Example #2
import configparser
import os
import sys

# help_message, open_utf8, join_filenames, Preparser, Parser, Unparser,
# Word2Vec and Generator are assumed to come from the project's own modules.


def main():
    argdict = dict(zip(sys.argv, sys.argv[1:] + ['']))
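    # The zip pairs every argv token with its successor, so each flag maps to
    # the value that follows it, e.g. ["prog", "-w", "1000"] ->
    # {"prog": "-w", "-w": "1000", "1000": ""}.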
    if "-h" in argdict:
        print(help_message)
        return
    
    # Paths to the data files.
    raw_filename = join_filenames("data", "tweets.csv")
    filtered_filename = join_filenames("data", "_tweets_filtered.txt")
    stat_filename = join_filenames("data", "tweets_stat.txt")
    tokenized_filename = join_filenames("data", "tweets_tokenized.txt")
    
    # Model dimensions, read from session.ini.
    session_config = configparser.ConfigParser()
    session_config.read('session.ini')

    word2vec_batch_size = 640
    embedding_size = int(session_config['dimension']['embedding_size'])
    gen_batch_size = 128
    gen_seq_length = int(session_config['dimension']['gen_seq_length'])
    gen_hidden_size = [int(x) for x in
                       session_config['dimension']['gen_hidden_size'].split(',')]
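    # A session.ini matching these reads could look like (values illustrative):
    #   [dimension]
    #   embedding_size = 128
    #   gen_seq_length = 32
    #   gen_hidden_size = 256,256
    #
    #   [save_file]
    #   word2vec_save = word2vec.ckpt
    #   generator_save = generator.ckpt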

    # Hyper-parameters of the model.
    learning_rate = 1e-06
    

    if "-i" in argdict:
        # Filter valid tweets from the data file, then tokenize them with the NLP parser.
        if os.path.isfile(tokenized_filename):
            proceed = (input("This will erase old data. OK to proceed? (Y/N) ")
                       .strip().upper() == "Y")
        else:
            proceed = True
        if proceed:
            with open_utf8(raw_filename, "r") as raw_file_r:
                # Keep only lines that are actual tweets.
                preparser = Preparser(raw_file_r)
                preparser.extract(filter=True)
                with open_utf8(filtered_filename, "w") as filtered_file_w:
                    preparser.save(filtered_file_w)
                
                # Tokenize the filtered tweets.
                with open_utf8(filtered_filename, "r") as filtered_file_r:
                    parser = Parser(filtered_file_r)
                    with open_utf8(stat_filename, "w") as stat_file_w:
                        parser.get_stats(stat_file_w)
                    with open_utf8(tokenized_filename, "w") as tokenized_file_w:
                        parser.get_data(tokenized_file_w)
                
    if "-w" in argdict and int(argdict["-w"]) >= 0:
        # Start or resume word2vec training.
        word2vec_num_step = int(argdict["-w"])
        if "-W" in argdict:
            word2vec_save_filename = join_filenames("saves", argdict["-W"])
        else:
            word2vec_save_filename = join_filenames(
                "saves", session_config['save_file']['word2vec_save'])
        word2vec_restore = os.path.isfile(word2vec_save_filename+".meta")
        
        word2vec = Word2Vec(tokenized_filename, stat_filename)
        word2vec.give_code()
        word2vec.tf_init(embedding_size=embedding_size,
                         batch_size=word2vec_batch_size, seed=None)
        word2vec.tf_run(word2vec_num_step, word2vec_save_filename,
                        restore=word2vec_restore)
        
        if "-g" in argdict and int(argdict["-g"]) >= 0:
        #Start or continue generator learning
            with open_utf8(stat_filename, "r") as stat_file_r, open_utf8(tokenized_filename, "r") as tokenized_file_r:
                embeddings = word2vec.Embeddings()
                if "-G" in argdict:
                    gen_save_filename = join_filenames("saves", argdict["-G"])
                else:
                    gen_save_filename = join_filenames(
                        "saves", session_config['save_file']['generator_save'])
                gen_restore = os.path.isfile(gen_save_filename+".meta")
                generator = Generator(embeddings)
                generator.nn_init(
                    gen_batch_size, gen_seq_length, gen_hidden_size,
                    learning_rate=learning_rate, seed=None,
                    use_vector=("-V" in argdict))
                generator.train_real_data(int(argdict["-g"]), tokenized_file_r,
                                          gen_save_filename, restore=gen_restore)
                
                if "-s" in argdict and int(argdict["-s"]) >= 0:
                    result_filename = join_filenames(argdict["-S"])
                    unparser = Unparser(result_filename)
                    sentences = generator.generate(gen_save_filename,
                                                   int(argdict["-s"]))
                    unparser.save(sentences)
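The function above is only a definition; a minimal entry point, plus some illustrative invocations (flag values are examples, not project defaults), might look like:

if __name__ == "__main__":
    main()

# Example invocations (illustrative):
#   python main.py -i                # preprocess and tokenize the tweets
#   python main.py -w 10000          # train word2vec for 10000 steps
#   python main.py -w 0 -g 5000 -s 20 -S results.txt
#       # restore word2vec, train the generator, then sample 20 tweets

Note that generation ("-s") only runs inside the "-g" branch, which itself sits inside the "-w" branch, so all three flags must be given together.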