Example #1
import library as lib
import requests
import json
import numpy as np
import time
import re

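# Tally the number of long documents (content > 2000 characters) and their
# total content length across the 60 archived epochs.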
total_docs = 0
content_length = 0
for epoch in range(60):
    ids = lib.load_from_file('epochs/archive/ids-epoch-' + str(epoch) + '.txt')
    titles = lib.load_from_file('epochs/archive/titles-epoch-' + str(epoch) +
                                '.txt')
    contents = lib.load_from_file('epochs/archive/contents-epoch-' +
                                  str(epoch) + '.txt')
    ids2 = []
    titles2 = []
    contents2 = []

    for i in range(len(ids)):
        if len(contents[i]) > 2000:
            ids2.append(ids[i])
            titles2.append(titles[i])
            contents2.append(contents[i])

    # print(max([len(content) for content in contents]))
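    # Skip the first 500 long documents in this epoch and tally the rest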
    selection = contents2[500:]
    content_length += sum([len(content) for content in selection])
    total_docs += len(selection)
    print(len(selection))
Example #2
from pathlib import Path
import os
import time

import numpy as np
import matplotlib.pyplot as plt

import library as lib


def main(args_obj):

    # Find an unused attempt directory to write this run's data to
    attempt = 0
    while attempt < 1000:
        log_dir = Path('epochs/Attempt (' + str(args_obj['numberOfTopics']) +
                       'T)_' + str(attempt))
        if not log_dir.exists():
            break
        attempt += 1
    log_dir = str(log_dir)
    os.mkdir(log_dir)

    # Calculate the number of epochs (batches) per pass over the corpus
    epochs = int(args_obj['sizeOfCorpus'] / args_obj['sizeOfBatch'])

    # Get the index of the most recent article
    # total_articles = requests.get('http://localhost:5000/api/articles/total')

    # Initialise the Online LDA model
    modelA = lib.OnlineLDA(args_obj['vocabulary'], args_obj['sizeOfCorpus'],
                           args_obj['sizeOfBatch'], args_obj['numberOfTopics'],
                           args_obj['tau'], args_obj['kappa'],
                           args_obj['alpha'], args_obj['eta'])
    # modelB = lib.OnlineLDA(args_obj['vocabulary'], args_obj['sizeOfCorpus'], args_obj['sizeOfBatch'],  args_obj['numberOfTopics'])

    # Initialise the Vanilla LDA model
    # modelA = lib.BasicLDA(args_obj['vocabulary'], args_obj['sizeOfCorpus'], args_obj['numberOfTopics'])
    # modelB = lib.BasicLDA(args_obj['vocabulary'], args_obj['sizeOfCorpus'], args_obj['numberOfTopics'])

    # Keep track of elbo progression
    elbosA = []

    # Setup plotting grid
    fig, ax = plt.subplots()

    # Get articles and perform batch learning
    start = time.time()
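    # Two passes over the corpus: round 0 trains the topics with SVI,
    # round 1 re-infers document parameters with the learned topics held fixed.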
    for round in range(2):
        # id = 66550
        print(
            '-----------------------------------------------------------------------------------------'
        )
        print(
            "                                         ROUND %s                                         "
            % str(round + 1))
        print(
            '-----------------------------------------------------------------------------------------'
        )

        # Prepare testing docs
        testing_contents = []
        for epoch in range(epochs):
            testing_contents += lib.load_from_file(
                'epochs/testing_contents-epoch-' + str(epoch) + '.txt')[:2]
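        # Split each test document into an observed part and a held-out part
        # for computing the approximate log predictive probability.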
        (docs_test_obs_ids, docs_test_obs_cts, docs_test_ho_ids,
         docs_test_ho_cts) = lib.split_test_docs(testing_contents,
                                                 modelA._vocab)

        for epoch in range(epochs):
            # if (epoch % 5 == 4):
            print("Epoch: " + str(epoch + 1) + "/" + str(epochs))

            # Get subsequent 1000 articles
            # raw_articles = json.loads(requests.get('http://localhost:5000/api/articles/1000/' + str(id)).text)['articles']
            # ids = [ article['_id'] for article in raw_articles]
            # titles = [ article['title'] for article in raw_articles]
            # contents = [ lib.prep_doc(article['content']) for article in raw_articles]
            ids = lib.load_from_file('epochs/ids-epoch-' + str(epoch) + '.txt')
            titles = lib.load_from_file('epochs/titles-epoch-' + str(epoch) +
                                        '.txt')
            contents = lib.load_from_file('epochs/contents-epoch-' +
                                          str(epoch) + '.txt')

            # randomise order of the articles
            if (round == 0):
                orderA = np.random.permutation(len(contents))
                np.save(
                    'epochs/Attempt (' + str(args_obj['numberOfTopics']) +
                    'T)_' + str(attempt) + '/AttemptA-order-' +
                    str(args_obj['numberOfTopics']) + '-epoch-' + str(epoch) +
                    '.npy', orderA)
            else:
                orderA = np.load('epochs/Attempt (' +
                                 str(args_obj['numberOfTopics']) + 'T)_' +
                                 str(attempt) + '/AttemptA-order-' +
                                 str(args_obj['numberOfTopics']) + '-epoch-' +
                                 str(epoch) + '.npy')

            # Randomly shuffle order
            contents = list(np.array(contents)[orderA])

            # # Run VI
            # gamA, lamA = modelA.update_params_VI(contents)
            # gamB, lamB = modelA.update_params_VI(contents)

            # # Externally save document params
            # np.save('epochs/Attempt (' + str(args_obj['numberOfTopics']) + 'T)_' + str(attempt) + '/AttemptA-gam-' + str(args_obj['numberOfTopics']) + '-epoch-' + str(epoch) +'.npy', gamA)
            # np.save('epochs/Attempt (' + str(args_obj['numberOfTopics']) + 'T)_' + str(attempt) + '/AttemptB-gam-' + str(args_obj['numberOfTopics']) + '-epoch-' + str(epoch) +'.npy', gamB)

            # # Externally save topic params
            # np.save('epochs/Attempt (' + str(args_obj['numberOfTopics']) + 'T)_' + str(attempt) + '/AttemptA-lam-' + str(args_obj['numberOfTopics']) + '.npy', modelA._lambda)
            # np.save('epochs/Attempt (' + str(args_obj['numberOfTopics']) + 'T)_' + str(attempt) + '/AttemptB-lam-' + str(args_obj['numberOfTopics']) + '.npy', modelB._lambda)

            # Run SVI
            if (round == 0):
                # Run SVI
                gamA, lamA, elboA = modelA.update_params_batch_SVI(
                    contents, calc_elbo=False)
                # gamB, lamB = modelA.update_params_batch_SVI(contents)

                # Keep track of elbo
                elbosA.append(
                    modelA.approx_log_pred(docs_test_obs_ids,
                                           docs_test_obs_cts, docs_test_ho_ids,
                                           docs_test_ho_cts))
                # elbosA.append(elboA)

                # Externally save topic params
                np.save(
                    'epochs/Attempt (' + str(args_obj['numberOfTopics']) +
                    'T)_' + str(attempt) + '/AttemptA-lam-' +
                    str(args_obj['numberOfTopics']) + '.npy', modelA._lambda)
                # np.save('epochs/Attempt (' + str(args_obj['numberOfTopics']) + 'T)_' + str(attempt) + '/AttemptB-lam-' + str(args_obj['numberOfTopics']) + '.npy', modelB._lambda)
            elif (round == 1):
                # Get topic assignments using the current lambda (no update)
                gamA, lamA, elboA = modelA.update_params_batch_SVI(
                    contents, no_update=True, calc_elbo=False)
                # gamB, lamB = modelA.update_params_batch_SVI(contents, no_update=True)

            # Externally save document params
            np.save(
                'epochs/Attempt (' + str(args_obj['numberOfTopics']) + 'T)_' +
                str(attempt) + '/AttemptA-gam-' +
                str(args_obj['numberOfTopics']) + '-epoch-' + str(epoch) +
                '.npy', gamA)
            # np.save('epochs/Attempt (' + str(args_obj['numberOfTopics']) + 'T)_' + str(attempt) + '/AttemptB-gam-' + str(args_obj['numberOfTopics']) + '-epoch-' + str(epoch) +'.npy', gamB)

            # if (round == 0):
            #     # externally save the titles
            #     with open('epochs/titles-epoch-' + str(epoch) + '.txt', 'w') as f:
            #         for title in titles:
            #             f.write("%s\n" % re.sub(r'[^A-Za-z ]', ' ', title))

            #     # externally save the content
            #     with open('epochs/contents-epoch-' + str(epoch) + '.txt', 'w') as f:
            #         for content in contents:
            #             f.write("%s\n" % content)

            #     # externally save the ids
            #     with open('epochs/ids-epoch-' + str(epoch) + '.txt', 'w') as f:
            #         for _id in ids:
            #             f.write("%s\n" % _id)

            # update id for next iter
            # id = ids[-1] - 1

            if (round == 0 and epoch > 0):
                # Reset grid
                ax.cla()
                # ax.set(xlabel='epoch', ylabel='elbo', title="K=%s tau=%s kappa=%s alpha=%s eta=%s" % (str(args_obj['numberOfTopics']), str(args_obj['tau']), str(args_obj['kappa']), str(args_obj['alpha']), str(args_obj['eta']) ) )
                ax.set(xlabel='epoch',
                       ylabel='log pred',
                       title="K=%s tau=%s kappa=%s alpha=%s eta=%s" %
                       (str(args_obj['numberOfTopics']), str(args_obj['tau']),
                        str(args_obj['kappa']), str(
                            args_obj['alpha']), str(args_obj['eta'])))
                ax.grid()

                # Plot elbo
                x = range(len(elbosA))
                y = elbosA
                ax.plot(x, y)

                # Fit and plot a linear trendline over the logged values
                z = np.polyfit(x, y, 1)
                p = np.poly1d(z)
                ax.plot(x, p(x), "r--")

                # externally save figure
                fig.savefig('epochs/Attempt (' +
                            str(args_obj['numberOfTopics']) + 'T)_' +
                            str(attempt) + '/AttemptA-elbo_logpred-' +
                            str(args_obj['numberOfTopics']) + '.png')
                # plt.show()

            print("Elapsed Time: " + str(time.time() - start))
Example #3
            print("Elapsed Time: " + str(time.time() - start))


# Programme ------------------------------------------------------------------------------

epochs = 60
batch_size = 500
num_topics_sizes = [50, 100]
alphas = [1, 5, 10]
etas = [1, 0.5, 0.01]

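# Grid search over the topic-word prior (eta), the document-topic prior (alpha)
# and the number of topics.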
for eta in etas:
    for alpha in alphas:
        for num_topics in num_topics_sizes:
            args_obj = {
                'vocabulary': lib.load_from_file('vocab/badwords.txt'),
                'sizeOfCorpus': epochs * batch_size,
                'sizeOfBatch': batch_size,
                'numberOfTopics': num_topics,
                'tau': 4096,
                'kappa': 1,
                'alpha': alpha,  # doc-topic
                'eta': eta  # topic-word
            }

            print(
                '-----------------------------------------------------------------------------------------'
            )
            print("no. of topics:", num_topics)
            print("alpha:", alpha)
            print("eta:", eta)
Example #4
# import library
# import library as mylib
from library import list_by_tags, load_from_file

if __name__=="__main__":
    library = load_from_file("library.csv")
    for book in library:
        book[2] = book[2].split(",")
        book[3] = book[3].split(",")
    print(library)
    list_by_tags(library)
Example #5
import library as lib
import numpy as np

# print(lib.dirichlet_variance(lam))
# print(lib.dirichlet_entropy(lam))

epoch = 0
total_num_epochs = 60
attempt = 5
num_topics = 50

_dir = 'epochs/good/'

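# Reorder a list according to a saved permutation array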
def shuffle(list_array, order):
    return list(np.array(list_array)[order])

# Load the saved shuffle order, vocabulary, shuffled documents, and fitted variational parameters for analysis
orderA = np.load(_dir + 'Attempt (' + str(num_topics) + 'T)_' + str(attempt) + '/AttemptA-order-' + str(num_topics) + '-epoch-' + str(epoch) +'.npy')
vocab = lib.load_from_file('vocab/goodwords.txt')
titles = shuffle(lib.load_from_file('epochs/titles-epoch-' + str(epoch) + '.txt'), orderA)
contents = shuffle(lib.load_from_file('epochs/contents-epoch-' + str(epoch) + '.txt'), orderA)
lamA = np.load(_dir + 'Attempt (' + str(num_topics) + 'T)_' + str(attempt) + '/AttemptA-lam-' + str(num_topics) + '.npy') 
gamA = np.load(_dir + 'Attempt (' + str(num_topics) + 'T)_' + str(attempt) + '/AttemptA-gam-' + str(num_topics) + '-epoch-' + str(epoch) +'.npy')
# lamB = np.load(_dir + 'Attempt (' + str(num_topics) + 'T)_' + str(attempt) + '/AttemptB-lam-' + str(num_topics) + '.npy')
# gamB = np.load(_dir + 'Attempt (' + str(num_topics) + 'T)_' + str(attempt) + '/AttemptB-gam-' + str(num_topics) + '-epoch-' + str(epoch) +'.npy')
K = lamA.shape[0]
batch_size = gamA.shape[0]
# batch_size = 500

# Initialising the model
modelA = lib.OnlineLDA(vocab, batch_size * total_num_epochs, batch_size, K)
modelA._lambda = lamA
# modelB = lib.OnlineLDA(vocab, batch_size*total_num_epochs, batch_size,  K)
# modelB._lambda = lamB
Example #6
import sys
sys.path.append('../')
import library as lib
import numpy as np
import time
import json
import matplotlib.pyplot as plt
import re

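# Load the first epoch's article ids, titles, contents, feed ids, and dates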
ids = lib.load_from_file('epochs/ids-epoch-' + str(0) + '.txt')
titles = lib.load_from_file('epochs/titles-epoch-' + str(0) + '.txt')
contents = lib.load_from_file('epochs/contents-epoch-' + str(0) + '.txt')
feeds = lib.load_from_file('epochs/feeds-epoch-' + str(0) + '.txt')
dates = lib.load_from_file('epochs/dates-epoch-' + str(0) + '.txt')

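# The first line of feeddetails.txt is a JSON list of feed metadata objects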
feed_details = json.loads(lib.load_from_file('epochs/feeddetails.txt')[0])


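# Look up a feed by its _id and return its site URL with any query string removed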
def get_details(_id):
    for feed in feed_details:
        if feed['_id'] == _id:
            return re.sub(r'\?.*', '', feed['siteUrl'])
            # return feed['title']


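# Return the year-month prefix (first 7 characters) of a date string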
def month(date):
    return date[:7]


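# Append the ids from the remaining epochs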
for epoch in range(1, 60):
    ids += lib.load_from_file('epochs/ids-epoch-' + str(epoch) + '.txt')