Exemplo n.º 1
0
    def detect_events(self,
                      index,
                      doc_field,
                      max_perc_words_by_topic,
                      logger,
                      time_slice_length,
                      k=10,
                      rel_words_per_event=5,
                      theta=0.6,
                      sigma=0.5):
        """Build a discretized corpus and run the two MABED phases on it.

        The tweets are fetched with ``self.get_vocabulary_tweets`` and wrapped
        in a ``TobasCorpus`` stored on ``self.corpus``, which is then
        discretized. Phase 1 runs on a separate ``MABED`` instance created
        from that corpus; phase 2 runs on ``self``.

        Args:
            index: Forwarded to ``self.get_vocabulary_tweets``.
            doc_field: Forwarded to ``self.get_vocabulary_tweets``.
            max_perc_words_by_topic: Forwarded to
                ``self.get_vocabulary_tweets``.
            logger: Passed to ``corpus.discretize`` and to the ``MABED``
                constructor.
            time_slice_length: Passed to ``corpus.discretize``.
            k: NOTE(review): accepted but never referenced in this method.
            rel_words_per_event: Stored on ``self`` both as
                ``rel_words_per_event`` and as ``p``.
            theta: Stored on ``self.theta``.
            sigma: Stored on ``self.sigma``.

        Returns:
            Whatever ``self.phase2`` returns for the phase-1 events.
        """
        print("Getting the vocabulary-based tweets")
        tweets = self.get_vocabulary_tweets(index, doc_field,
                                            max_perc_words_by_topic)

        print("Setting the corpus")
        # Original author's note: tweets carry text and timestamp_ms, and the
        # timestamp must be a date object -- TODO confirm against TobasCorpus.
        self.corpus = TobasCorpus(tweets=tweets)

        print("Discretizing the corpus")
        self.corpus.discretize(time_slice_length, logger=logger)

        print("Running MABED phase 1")
        detector = MABED(self.corpus, logger)
        # The value is also exposed as ``p`` because, per the original
        # author's note, some inherited methods look it up under that name.
        self.rel_words_per_event = self.p = rel_words_per_event
        self.theta, self.sigma = theta, sigma
        candidate_events = detector.phase1()

        print("Running MABED phase 2")
        events = self.phase2(candidate_events)
        print("Events", events)
        return events
Exemplo n.º 2
0
    def detect_filtered_events(self,
                               index="test3",
                               k=10,
                               maf=10,
                               mrf=0.4,
                               tsl=30,
                               p=10,
                               theta=0.6,
                               sigma=0.6,
                               session=False,
                               filter=False,
                               cluster=2):
        """Load a corpus, partition it into time-slices and run MABED on it.

        Progress and timings are reported on stdout with ``print``.

        Args:
            index: Forwarded to ``Corpus`` as ``index``.
            k: Forwarded to ``MABED.run``.
            maf: Reported as "Min. abs. word frequency"; forwarded to
                ``Corpus``.
            mrf: Reported as "Max. rel. word frequency"; forwarded to
                ``Corpus``.
            tsl: Time-slice length handed to ``Corpus.discretize``; the
                progress message reports it in minutes.
            p: Forwarded to ``MABED.run``.
            theta: Forwarded to ``MABED.run``.
            sigma: Forwarded to ``MABED.run``.
            session: Forwarded to ``Corpus`` as ``session``.
            filter: Forwarded to ``Corpus`` as ``filter``.
            cluster: Second argument handed to ``Corpus.discretize``.

        Returns:
            ``False`` when the loaded corpus has no tweets (falsy
            ``tweets`` attribute); otherwise the ``MABED`` instance after
            ``run`` has completed.
        """
        stopwords_path = 'stopwords/twitter_all.txt'
        separator = '\t'

        print('Parameters:')
        print(
            '   Index: %s\n   k: %d\n   Stop-words: %s\n   Min. abs. word frequency: %d\n   Max. rel. word frequency: %f'
            % (index, k, stopwords_path, maf, mrf))
        print('   p: %d\n   theta: %f\n   sigma: %f' % (p, theta, sigma))

        print('Loading corpus...')
        began = timeit.default_timer()
        corpus = Corpus(stopwords_path,
                        maf,
                        mrf,
                        separator,
                        index=index,
                        session=session,
                        filter=filter)
        # Bail out before reporting the load time when nothing was loaded.
        if not corpus.tweets:
            return False
        print('Corpus loaded in %f seconds.' %
              (timeit.default_timer() - began))

        print('Partitioning tweets into %d-minute time-slices...' % tsl)
        began = timeit.default_timer()
        corpus.discretize(tsl, cluster)
        print('Partitioning done in %f seconds.' %
              (timeit.default_timer() - began))

        print('Running MABED...')
        began = timeit.default_timer()
        detector = MABED(corpus)
        detector.run(k=k, p=p, theta=theta, sigma=sigma)
        print('Event detection performed in %f seconds.' %
              (timeit.default_timer() - began))
        return detector
Exemplo n.º 3
0
    # NOTE(review): this excerpt begins inside an enclosing definition that is
    # not visible here; `args` is presumably an argparse namespace -- confirm.

    # Load the corpus from the parsed arguments and report how long it took.
    start_time = timeit.default_timer()
    my_corpus = Corpus(args.i, args.sw, args.maf, args.mrf)
    elapsed = timeit.default_timer() - start_time
    print('Corpus loaded in %f seconds.' % elapsed)

    # Partition the corpus into time-slices; the progress message reports the
    # slice length in minutes.
    time_slice_length = args.tsl
    print('Partitioning tweets into %d-minute time-slices...' %
          time_slice_length)
    start_time = timeit.default_timer()
    my_corpus.discretize(time_slice_length)
    elapsed = timeit.default_timer() - start_time
    print('Partitioning done in %f seconds.' % elapsed)

    # Run the detector with the parameters taken from the parsed arguments.
    print('Running MABED...')
    k = args.k
    p = args.p
    theta = args.t
    sigma = args.s
    start_time = timeit.default_timer()
    mabed = MABED(my_corpus)
    mabed.run(k=k, p=p, theta=theta, sigma=sigma)
    # The reporting calls below happen before the timer is read, so the
    # "Event detection performed" figure includes them as well as run().
    mabed.print_events()
    mabed.prepare_csv(my_corpus.save_start_date, time_slice_length)
    mabed.print_anomalies()
    elapsed = timeit.default_timer() - start_time
    print('Event detection performed in %f seconds.' % elapsed)

    # Persist the events only when an output path was supplied.
    if args.o is not None:
        utils.save_events(mabed, args.o)
        print('Events saved in %s' % args.o)
Exemplo n.º 4
0
    # NOTE(review): this excerpt begins inside an enclosing definition that is
    # not visible here; `args` is presumably an argparse namespace -- confirm.
    # Echo the detection parameters taken from the parsed arguments.
    print('   p: %d\n   theta: %f\n   sigma: %f' % (args.p, args.t, args.s))

    # Load the corpus from the parsed arguments and report how long it took.
    print('Loading corpus...')
    start_time = timeit.default_timer()
    my_corpus = Corpus(args.i, args.sw, args.maf, args.mrf, args.sep)
    elapsed = timeit.default_timer() - start_time
    print('Corpus loaded in %f seconds.' % elapsed)

    # Partition the corpus into time-slices; the progress message reports the
    # slice length in minutes.
    time_slice_length = args.tsl
    print('Partitioning tweets into %d-minute time-slices...' %
          time_slice_length)
    start_time = timeit.default_timer()
    my_corpus.discretize(time_slice_length)
    elapsed = timeit.default_timer() - start_time
    print('Partitioning done in %f seconds.' % elapsed)

    # Run the detector with the parameters taken from the parsed arguments.
    print('Running MABED...')
    k = args.k
    p = args.p
    theta = args.t
    sigma = args.s
    start_time = timeit.default_timer()
    mabed = MABED(my_corpus)
    mabed.run(k=k, p=p, theta=theta, sigma=sigma)
    # print_events() runs before the timer is read, so the reported detection
    # time includes printing the events as well as run().
    mabed.print_events()
    elapsed = timeit.default_timer() - start_time
    print('Event detection performed in %f seconds.' % elapsed)

    # Persist the events only when an output path was supplied.
    if args.o is not None:
        utils.save_events(mabed, args.o)
        print('Events saved in %s' % args.o)