def main():
    np.random.seed(42)
    data_dir_path = './data'
    very_large_data_dir_path = './very_large_data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv("dcr Man_Cleaned.csv")

    print('extract configuration from input texts ...')
    Y = df.Title
    X = df['Joined']
    config = fit_text(X, Y)

    print('configuration extracted from input texts ...')

    summarizer = Seq2SeqGloVeSummarizer(config)
    summarizer.load_glove(very_large_data_dir_path)

    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(weight_file_path=Seq2SeqGloVeSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20, batch_size=16)

    history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeSummarizer.model_name + '-history-v' + str(summarizer.version) + '.png'
    plot_and_save_history(history, summarizer.model_name, history_plot_file_path, metrics={'loss', 'acc'})
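None of these snippets show their imports: each assumes numpy, pandas, scikit-learn's train_test_split, the summarizer classes, plot_and_save_history, and a module-level LOAD_EXISTING_WEIGHTS flag. A plausible shared preamble, with the keras_text_summarization module paths inferred from an inline comment in Example #7 (the exact paths are assumptions):

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# paths inferred from a comment in Example #7; not verified against the package
from keras_text_summarization.library.applications.fake_news_loader import fit_text
from keras_text_summarization.library.seq2seq import Seq2SeqSummarizer

LOAD_EXISTING_WEIGHTS = False  # set True to resume from previously saved weights

if __name__ == '__main__':
    main()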
def main():
    np.random.seed(42)
    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

    Y = df.title
    X = df['text']

    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)

    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(
            summarizer.version) + '.png'
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})
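plot_and_save_history itself is never shown on this page. Judging from how it is called here and from the inline matplotlib code in Example #12, a minimal compatible sketch (an assumption, not the package's actual implementation) is:

import matplotlib.pyplot as plt

def plot_and_save_history(history, model_name, file_path, metrics={'loss', 'acc'}):
    # one subplot per metric, each showing the train and validation curves
    plt.figure(figsize=(6, 4 * len(metrics)))
    for k, metric in enumerate(sorted(metrics)):
        plt.subplot(len(metrics), 1, k + 1)
        plt.plot(history.history[metric], label='train')
        plt.plot(history.history['val_' + metric], label='validation')
        plt.title(model_name + ' ' + metric)
        plt.xlabel('epoch')
        plt.legend()
    plt.tight_layout()
    plt.savefig(file_path)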
def train():
    LOAD_EXISTING_WEIGHTS = False
    LOAD_DFARTICLES = True

    np.random.seed(42)
    report_dir_path = 'reports'
    model_dir_path = 'models'

    print('loading training data')
    if not LOAD_DFARTICLES:
        df = pd.DataFrame(columns=['abstract', 'text'])
        i = 0
        for article in get_articles(year=2017):
            print(i)
            tempDF = pd.DataFrame({
                'abstract': [article['description']],
                'text': [article['fullText']]
            })
        df = pd.concat([df, tempDF], ignore_index=True)  # df.append was removed in pandas 2.0
        if i % 10 == 0:
            # checkpoint every 10 articles so a crash does not lose progress
            with open('dfArticles2017.pkl', 'wb') as f:
                print('checkpointing dataframe ...')
                pickle.dump([df, i], f)
            # if i >= 100:
            #     break
            i += 1
    else:
        # resume from the latest checkpoint (the file handle is closed automatically)
        with open("dfArticles2017.pkl", "rb") as pickle_in:
            df, i = pickle.load(pickle_in)

    print('extract configuration from input texts ...')
    Y = df.abstract
    X = df['text']

    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)

    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(
            summarizer.version) + '.png'
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})
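get_articles is not defined anywhere on this page. A hypothetical stand-in, only to document the interface the loop above expects (a generator yielding dicts with 'description' and 'fullText' keys):

def get_articles(year):
    # hypothetical placeholder; the real data source behind this example is unknown
    sample = [{'description': 'short abstract ...', 'fullText': 'full article body ...'}]
    for record in sample:
        yield record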
Example #4
def main():
    np.random.seed(42)
    data_dir_path = 'demo/data'
    report_dir_path = 'demo/reports'
    model_dir_path = 'demo/models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

    print('extract configuration from input texts ...')
    Y = df.title
    X = df['text']

    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain,
                             Ytrain,
                             Xtest,
                             Ytest,
                             epochs=100,
                             model_dir_path=model_dir_path)
Example #5
def main():
    np.random.seed(42)
    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

    # df = df.loc[df.index < 1000]

    print('extract configuration from input texts ...')
    Y = df.title
    X = df['text']
    config = fit_text(X, Y)

    print('configuration extracted from input texts ...')

    summarizer = RecursiveRNN2(config)

    if LOAD_EXISTING_WEIGHTS:
        weight_file_path = RecursiveRNN2.get_weight_file_path(
            model_dir_path=model_dir_path)
        summarizer.load_weights(weight_file_path=weight_file_path)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain,
                             Ytrain,
                             Xtest,
                             Ytest,
                             epochs=20,
                             batch_size=256)

    history_plot_file_path = report_dir_path + '/' + RecursiveRNN2.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + RecursiveRNN2.model_name + '-history-v' + str(
            summarizer.version) + '.png'
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})
Example #6
def main():
    np.random.seed(42)
    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')

    with open(data_dir_path + '/summary2.pkl', 'rb') as f:
        list_of_summaries = pickle.load(f)
    with open(data_dir_path + '/text2.pkl', 'rb') as f:
        list_of_text = pickle.load(f)

    # df = df.loc[df.index < 1000]
    X = list_of_text
    Y = list_of_summaries
    config = fit_text(X, Y)

    print('configuration extracted from input texts ...')

    summarizer = RecursiveRNN1(config)

    if LOAD_EXISTING_WEIGHTS:
        weight_file_path = RecursiveRNN1.get_weight_file_path(
            model_dir_path=model_dir_path)
        summarizer.load_weights(weight_file_path=weight_file_path)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=20)

    history_plot_file_path = report_dir_path + '/' + RecursiveRNN1.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + RecursiveRNN1.model_name + '-history-v' + str(
            summarizer.version) + '.png'
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})
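The page does not show how summary2.pkl and text2.pkl were produced; any preprocessing step that dumps two parallel lists would work. A hypothetical sketch, assuming a source CSV with 'summary' and 'text' columns:

import pickle
import pandas as pd

df = pd.read_csv('./data/some_corpus.csv').dropna()  # hypothetical source file
with open('./data/summary2.pkl', 'wb') as f:
    pickle.dump(df['summary'].tolist(), f)
with open('./data/text2.pkl', 'wb') as f:
    pickle.dump(df['text'].tolist(), f)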
Example #7
def main():
    # seed() fixes the integer that initializes the random number generator, so the
    # same seed reproduces the same "random" numbers on every run; without it the
    # generator is seeded from the current time and results differ between runs.
    np.random.seed(42)
    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/chinese_data.csv")

    print('extract configuration from input texts ...')
    Y = df.title
    X = df['text']

    # fit_text is imported from keras_text_summarization.library.applications.fake_news_loader
    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)  # pass the returned config into the model defined in seq2seq.py

    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=10)

    history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(
            summarizer.version) + '.png'
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})
Example #8
def main():
    np.random.seed(42)
    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    #df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

    print('extract configuration from input texts ...')
    with open(data_dir_path + '/train_preprocessed.en') as f:
        X = f.read().splitlines()  # splitlines() avoids a trailing empty sample

    with open(data_dir_path + '/train_preprocessed.de') as f:
        Y = f.read().splitlines()
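    # added sanity check: the .en and .de corpora must be line-aligned
    assert len(X) == len(Y), 'parallel corpora must have the same number of lines'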
    config = fit_text(X, Y)

    summarizer = Seq2SeqSummarizer(config)

    if LOAD_EXISTING_WEIGHTS:
        summarizer.load_weights(
            weight_file_path=Seq2SeqSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    history = summarizer.fit(Xtrain, Ytrain, Xtest, Ytest, epochs=100)

    history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqSummarizer.model_name + '-history-v' + str(
            summarizer.version) + '.png'
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})
Example #9
def main():
    np.random.seed(42)
    data_dir_path = './data'
    report_dir_path = './reports'
    model_dir_path = './models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/news_summary.csv", encoding='cp437')

    df = df.dropna()
    df = df.drop(columns=['date', 'headlines', 'read_more'])  # positional axis arg was removed in pandas 2.0
    # setting 'author' as the index and then dropping the index removes that column
    df = df.set_index('author')
    df = df.reset_index(drop=True)

    print('extract configuration from input texts ...')
    Y = df.text
    X = df.ctext

    config = fit_text(X, Y)
    num_input_tokens = config['num_input_tokens']
    print('num_input_tokens: ' + str(num_input_tokens))  # it is an int, so str(), not len()
Example #10
def main():
    data_dir_path = './data'

    # Import `fake_or_real_news.csv`
    df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

    # Inspect shape of `df`
    print(df.shape)

    # Print first lines of `df`
    print(df.head())

    # Set index
    df = df.set_index("Unnamed: 0")

    # Print first lines of `df`
    print(df.head())

    # Set `X` and `Y`
    Y = df.title
    X = df['text']

    # Drop the `title` column (drop() returns a new frame, so reassign)
    df = df.drop("title", axis=1)

    # Make training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.33,
                                                        random_state=53)

    print('X train: ', X_train.shape)
    print('Y train: ', y_train.shape)

    config = fit_text(X, Y)

    print('num_input_tokens: ', config['num_input_tokens'])
    print('num_target_tokens: ', config['num_target_tokens'])
    print('max_input_seq_length: ', config['max_input_seq_length'])
    print('max_target_seq_length: ', config['max_target_seq_length'])
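fit_text's implementation is not shown on this page. Judging by the config keys these examples consume ('num_input_tokens', 'num_target_tokens', 'max_input_seq_length', 'max_target_seq_length', and the word-index maps referenced in Example #12), a minimal sketch could look like this (the real function may cap vocabularies or clean text differently):

from collections import Counter

def fit_text_sketch(X, Y, max_vocab_size=5000):
    # count tokens and track the longest sequence on each side
    input_counter, target_counter = Counter(), Counter()
    max_input_seq_length = max_target_seq_length = 0
    for text in X:
        tokens = str(text).lower().split()
        input_counter.update(tokens)
        max_input_seq_length = max(max_input_seq_length, len(tokens))
    for text in Y:
        tokens = str(text).lower().split()
        target_counter.update(tokens)
        max_target_seq_length = max(max_target_seq_length, len(tokens))
    # keep the most frequent words; index 0 is left for padding/unknown tokens
    input_word2idx = {w: i + 1 for i, (w, _) in enumerate(input_counter.most_common(max_vocab_size))}
    target_word2idx = {w: i + 1 for i, (w, _) in enumerate(target_counter.most_common(max_vocab_size))}
    return {
        'input_word2idx': input_word2idx,
        'target_word2idx': target_word2idx,
        'num_input_tokens': len(input_word2idx),
        'num_target_tokens': len(target_word2idx),
        'max_input_seq_length': max_input_seq_length,
        'max_target_seq_length': max_target_seq_length,
    }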
Example #11
def main():
    np.random.seed(42)
    data_dir_path = './demo/data'
    very_large_data_dir_path = './very_large_data'
    report_dir_path = './reports'
    model_dir_path = './models'
    '''filenames = load_data(data_dir_path, data_categories[0])
    print(len(filenames))
    data = {'articles': [], 'summaries': []}
    i =-1
    for x in sorted(filenames):
        i +=1
        if i%2 == 0:
            filename = x.split('.')[0]

            if os.path.exists(data_dir_path+data_categories[0]+'/'+filename+'.summ') and os.path.exists(data_dir_path+data_categories[0]+'/'+filename+'.sent'):
                try:
                    data['articles'].append(cleantext(parsetext(data_dir_path,data_categories[0],"{}".format(filename+'.sent'))))
                    data['summaries'].append(
                        cleantext(parsetext(data_dir_path, data_categories[0], "{}".format(filename + '.summ'))))
                except Exception as e:
                    print(e)
        else:
            continue

    # OBSOLETE
    # with open('deepmind_news_training.pickle', 'wb') as handle:
    #    pickle.dump(data,handle)
    dd.io.save('deepmind_training.h5',{'articles':data['articles'], 'summaries':data['summaries']},compression=None)
    print(len(data['articles']))
    print(len(data['summaries']))

    exit(0)'''

    #data = dd.io.load('deepmind_training.h5')
    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/Reviews.csv").dropna()
    X = np.array(df['Text'].values)
    Y = np.array(df['Summary'].values)

    #with open('deepmind_news_training.pickle', 'rb') as handle:
    #    data = pickle.load(handle)

    # print('loading csv file ...')
    #df = pd.read_csv(data_dir_path + "/fake_or_real_news.csv")

    print('extract configuration from input texts ...')
    #Y = df.title
    #X = df['text']
    #Y = data['summaries'][:1000]
    #X = data['articles'][:1000]
    #del data

    config = fit_text(X, Y)
    print(config['max_target_seq_length'])
    print(config['max_input_seq_length'])
    print('configuration extracted from input texts ...')

    summarizer = Seq2SeqGloVeAttentionSummarizer(config, lr=1e-3)
    summarizer.load_glove(very_large_data_dir_path)

    if LOAD_EXISTING_WEIGHTS:
        # use the attention summarizer's own weight path, not Seq2SeqGloVeSummarizer's
        summarizer.load_weights(
            weight_file_path=Seq2SeqGloVeAttentionSummarizer.get_weight_file_path(
                model_dir_path=model_dir_path))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)
    print(Xtrain.shape)
    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')

    history = summarizer.fit(Xtrain,
                             Ytrain,
                             Xtest,
                             Ytest,
                             epochs=500,
                             batch_size=30)

    history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeAttentionSummarizer.model_name + '-history.png'
    if LOAD_EXISTING_WEIGHTS:
        history_plot_file_path = report_dir_path + '/' + Seq2SeqGloVeAttentionSummarizer.model_name + '-history-v' + str(
            summarizer.version) + '.png'
    plot_and_save_history(history,
                          summarizer.model_name,
                          history_plot_file_path,
                          metrics={'loss', 'acc'})

    rouge = Rouge()
    # score the generated summary against the reference summary, not the source text
    scores = rouge.get_scores(hyps=summarizer.summarize(df['Text'][0]),
                              refs=df['Summary'][0])
    print(scores)
    #print(df['Text'][0])
    for i in range(10):
        print(summarizer.summarize(df['Text'][i]))
    print("=====================")

    for i in range(10):
        print(df['Summary'][i])
    exit(0)
Example #12
def main():
    np.random.seed(42)
    data_dir_path = './demo/data'
    very_large_data_dir_path = './very_large_data'
    report_dir_path = './reports'
    model_dir_path = './models'
    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/Reviews.csv").dropna()
    X = df['Text']
    Y = df['Summary']
    print(len(X))
    '''fp = open('deepmind_news_training.pickle','rb')
    data = pickle.load(fp)
    fp.close()
    X = data['articles'][:500]
    Y = data['summaries'][:500]'''
    '''for i, value in enumerate(X):
        X[i] = cleantext(str(value))
    for i,value in enumerate(Y):
        Y[i] = cleantext(str(value))
    '''
    '''articles = glob.glob('BBC_1/Articles/*')
    summaries = glob.glob('BBC_1/Summaries/*')

    documents = []
    sums = []
    titles = []
    for folder in articles:
        docs = glob.glob(folder + '/*')
        #summaries path from doc's
        sumpath = folder.split(sep='/')
        sumpath = sumpath[0] + '/Summaries/' + sumpath[2]

        sumpaths = glob.glob(sumpath + '/*')
        for i,article in enumerate(docs):
            try:
                with open(article,'r') as fp:
                    d=fp.readlines()
                    with open(sumpaths[i],'r') as sp:
                        s = sp.read()

                        documents.append(''.join(d[1:]))
                        sums.append(s)
                        titles.append(d[0])

            except Exception as e:
                print('{}'.format(i))
                continue

    X = np.array(documents)
    Y = np.array(titles)
    print('X: {} Y:{}'.format(X.shape,Y.shape))
    print(titles[0])
    print(documents[0])'''
    config = fit_text(X, Y)
    #print(config)

    # Preparing GloVe
    '''embeddings_index = {}
    f = open(os.path.join(very_large_data_dir_path, 'glove.6B.{}d.txt'.format(EMBEDDING_DIM)))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    embedding_matrix = np.zeros((len(config['input_word2idx']) , EMBEDDING_DIM), dtype='float32')
    for word, i in config['input_word2idx'].items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in glove will be zeros
            embedding_matrix[i] = embedding_vector
    print('Embedding Matrix: {}'.format(embedding_matrix.shape))

    embedding_matrix_target = np.zeros((len(config['target_word2idx']), EMBEDDING_DIM), dtype='float32')
    for word, i in config['target_word2idx'].items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in glove will be zeros
            embedding_matrix_target[i] = embedding_vector
    print('Embedding Matrix: {}'.format(embedding_matrix_target.shape))'''

    summarizer = RecursiveRNN3(config=config)

    if LOAD_EXISTING_WEIGHTS:
        weight_file_path = RecursiveRNN3.get_weight_file_path(
            model_dir_path=model_dir_path)
        print('Loading Weights:' + weight_file_path)
        summarizer.load_weights(weight_file_path=weight_file_path)
    #summarizer.load_glove(very_large_data_dir_path)
    '''vocabulary_size = 100
    tokenizer = Tokenizer(num_words=vocabulary_size, lower=True)
    tokenizer.fit_on_texts()

    Xseq = tokenizer.texts_to_sequences(X)
    Yseq = tokenizer.texts_to_sequences(Y)
    Xf = pad_sequences(Xseq, maxlen=MAX_INPUT_LENGTH)
    Yf = pad_sequences(Yseq, maxlen=MAX_OUTPUT_LENGTH)'''

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)
    history = summarizer.fit(Xtrain,
                             Ytrain,
                             Xtest,
                             Ytest,
                             epochs=20,
                             batch_size=32)
    # summarize history for accuracy
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.savefig('accuracy.png')
    plt.show()

    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.savefig('loss.png')
    plt.show()

    for i in range(10):
        print(Xtrain[i])
        print('-------')
        print(summarizer.summarize(Xtrain[i]))
        print(Ytrain[i])
        print("=====")