Example #1
def post(self):
    text = api.payload["text"]
    user = api.payload["user"]
    if len(text) > 0:
        tweet = Tweet()
        tweet.text = text
        # look up the author by name and attach the tweet to them
        user = db.session.query(User).filter_by(name=user).first()
        tweet.user_id = user.id
        db.session.add(tweet)
        db.session.commit()
        return tweet, 201
    else:
        abort(422, "Tweet text can't be empty")
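
Example #1 reads the request body through api.payload, the Flask-RESTX (or older Flask-RESTPlus) shortcut for the posted JSON, and returns the new Tweet with a 201 status, which implies a marshalled response model. Below is a minimal sketch of the surrounding resource setup this method assumes; the model fields, route, and class name are illustrative, not taken from the original project.

from flask import Flask
from flask_restx import Api, Resource, fields

app = Flask(__name__)
api = Api(app)

# illustrative request/response model so that `return tweet, 201` gets marshalled
tweet_model = api.model("Tweet", {
    "id": fields.Integer(readonly=True),
    "text": fields.String(required=True),
    "user": fields.String(required=True),
})

@api.route("/tweets")
class TweetResource(Resource):
    @api.expect(tweet_model)
    @api.marshal_with(tweet_model, code=201)
    def post(self):
        ...  # body as in Example #1
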
Example #2
def create_tweet(user=None):
    # user is presumably injected by an auth decorator; reject anonymous requests
    if not user:
        abort(401)

    requested_object = request.get_json()
    tweet = Tweet()
    tweet.user_id = user.id
    try:
        # copy the required fields from the request body
        for attr in ['text']:
            setattr(tweet, attr, requested_object[attr])
    except KeyError:
        return '', 400

    db.session.add(tweet)
    db.session.commit()

    return tweet_schema.jsonify(tweet)
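
Example #2 serialises the result with tweet_schema.jsonify(...), which matches the flask-marshmallow schema API. A minimal sketch of the schema setup this view assumes, with illustrative names; note that flask-marshmallow must be initialised after Flask-SQLAlchemy for the SQLAlchemy-backed schemas to work.

from flask_marshmallow import Marshmallow

ma = Marshmallow(app)  # `app` is the Flask application object

class TweetSchema(ma.SQLAlchemyAutoSchema):
    class Meta:
        model = Tweet      # the SQLAlchemy model used in the example
        include_fk = True  # keep user_id in the serialised output

tweet_schema = TweetSchema()
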
Example #3
def result():
    selectedChoices = ChoiceObj('attractions', session.get('selected'))
    form_splace = SearchPlaceForm()
    form_stweets = SearchTweetsForm(obj=selectedChoices)

    if form_stweets.validate_on_submit():
        session['selected'] = form_stweets.multi_attractions.data
        place_name = form_stweets.place.data
        latitude = form_stweets.latitude.data
        longitude = form_stweets.longitude.data
        attractions = session.get('selected')
        range_dist = form_stweets.range_dist.data
        days_before = form_stweets.days_before.data

        # CRAWLING
        twitter_crawler = TwitterCrawler(current_app)
        df_attractions = twitter_crawler.fetch_tweets_from_attractions(
            attractions, int(days_before), float(latitude), float(longitude),
            int(range_dist), place_name)

        # if crawling returned fewer than 20 tweets, show a notification page
        if len(df_attractions) < 20:
            return render_template('notification.html')

        # insert into crawler table
        crawler = Crawler()
        crawler.timestamp = datetime.now(pytz.timezone('Asia/Jakarta'))
        db.session.add(crawler)
        db.session.commit()

        # insert into attractions table
        attractions_lower = [x.lower() for x in attractions]
        att = Attractions()
        att.attractions = ','.join(attractions_lower)
        att.crawler_id = crawler.id
        db.session.add(att)
        db.session.commit()

        # insert into tweet table
        for _, row in df_attractions.iterrows():
            tweet = Tweet()
            tweet.user_id = row['user_id']
            tweet.username = row['username']
            tweet.created = row['created_at']
            tweet.text = row['text']
            tweet.latitude = row['latitude']
            tweet.longitude = row['longitude']
            tweet.crawler_id = crawler.id
            db.session.add(tweet)
            db.session.commit()

        # PREPROCESSING
        tweets = Tweet.query.filter_by(crawler_id=crawler.id)
        attractions = Attractions.query.filter_by(crawler_id=crawler.id)

        # collect the attraction strings into a list
        list_attractions = []
        for a in attractions:
            list_attractions.append(a.attractions)

        list_attractions = ''.join(list_attractions).split(',')

        # pair each tweet id with its text
        list_tweets = []
        for t in tweets:
            id_tweet = [t.id, t.text]
            list_tweets.append(id_tweet)

        # set up the normalizer, tokenizer, and SymSpell spell checker
        normalizer = Normalize()
        tokenizer = Tokenize()
        symspell = SymSpell(max_dictionary_edit_distance=3)
        SITE_ROOT = os.path.abspath(os.path.dirname(__file__))
        json_url = os.path.join(SITE_ROOT, "..", "data",
                                "corpus_complete_model.json")
        symspell.load_complete_model_from_json(json_url, encoding="ISO-8859-1")

        # do preprocess
        result = []
        for tweet in list_tweets:
            tweet_id, text = tweet[0], tweet[1]

            # normalize
            tweet_norm = normalizer.remove_ascii_unicode(text)
            tweet_norm = normalizer.remove_rt_fav(tweet_norm)
            tweet_norm = normalizer.lower_text(tweet_norm)
            tweet_norm = normalizer.remove_newline(tweet_norm)
            tweet_norm = normalizer.remove_url(tweet_norm)
            tweet_norm = normalizer.remove_emoticon(tweet_norm)
            tweet_norm = normalizer.remove_hashtag_mention(tweet_norm)
            tweet_norm = normalizer.remove_punctuation(tweet_norm)

            # tokenize
            tweet_tok = tokenizer.WordTokenize(tweet_norm, removepunct=True)

            # spell correction
            temp = []
            for token in tweet_tok:
                suggestion = symspell.lookup(phrase=token,
                                             verbosity=1,
                                             max_edit_distance=3)

                # use the top suggestion if there is one, otherwise keep the token
                if len(suggestion) > 0:
                    get_suggestion = str(suggestion[0]).split(':')[0]
                    temp.append(get_suggestion)
                else:
                    temp.append(token)
            tweet_prepared = ' '.join(temp)

            # join multi-word attraction names into single tokens
            tweet_prepared = normalizer.join_attraction(
                tweet_prepared, list_attractions)

            id_tweet_prepared = [tweet_id, tweet_prepared]
            result.append(id_tweet_prepared)

        # insert into table preprocess
        for res in result:
            tweet_id, text = res[0], res[1]

            tb_preprocess = Preprocess()
            tb_preprocess.text = text
            tb_preprocess.tweet_id = tweet_id
            tb_preprocess.crawler_id = crawler.id
            db.session.add(tb_preprocess)
            db.session.commit()

        # POS TAGGING
        tweets_preprocessed = Preprocess.query.filter_by(crawler_id=crawler.id)

        # get text from table Preprocess
        list_tweets = []
        for t in tweets_preprocessed:
            tid_tweet = [t.tweet_id, t.text]
            list_tweets.append(tid_tweet)

        # paths to the tagger's lexicon and n-gram model files
        SITE_ROOT = os.path.abspath(os.path.dirname(__file__))
        lexicon_url = os.path.join(SITE_ROOT, "..", "data", "Lexicon.trn")
        ngram_url = os.path.join(SITE_ROOT, "..", "data", "Ngram.trn")

        # initialize the POS tagger and sentence tokenizer
        tagger = MainTagger(lexicon_url, ngram_url, 0, 3, 3, 0, 0, False, 0.2,
                            0, 500.0, 1)
        tokenize = Tokenization()

        # do pos tagging
        result = []
        for tweet in list_tweets:
            tweet_id, text = tweet[0], tweet[1]

            if len(text) == 0:
                tid_text = [tweet_id, text]
                result.append(tid_text)
            else:
                # pad single-word text with the Indonesian filler 'ini',
                # apparently so the tagger gets at least two tokens
                if len(text.split(' ')) == 1:
                    text = text + ' ini'
                out = tokenize.sentence_extraction(tokenize.cleaning(text))
                join_word = []
                for o in out:
                    strtag = " ".join(tokenize.tokenisasi_kalimat(o)).strip()
                    join_word += [" ".join(tagger.taggingStr(strtag))]
                tid_text = [tweet_id, join_word]
                result.append(tid_text)

        # insert into table postag
        for tweet in result:
            tweet_id, text = tweet[0], tweet[1]
            tweet_str = ''.join(text)

            tb_postag = PosTag()
            tb_postag.text = tweet_str
            tb_postag.tweet_id = tweet_id
            tb_postag.crawler_id = crawler.id
            db.session.add(tb_postag)
            db.session.commit()

        # CLASS DETERMINATION (PENENTUAN KELAS): split tagged words into
        # content words (Ccon) and function words (Cfunc)
        Ccon = ['JJ', 'NN', 'NNP', 'NNG', 'VBI', 'VBT']
        Cfunc = [
            'OP', 'CP', 'GM', ';', ':', '"', '.', ',', '-', '...', 'RB', 'IN',
            'MD', 'CC', 'SC', 'DT', 'UH', 'CDO', 'CDC', 'CDP', 'CDI', 'PRP',
            'WP', 'PRN', 'PRL', 'NEG', 'SYM', 'RP', 'FW'
        ]
        tweets_tagged = PosTag.query.filter_by(crawler_id=crawler.id)

        # get text from table PosTag
        list_tweets = []
        for t in tweets_tagged:
            tid_tweet = [t.tweet_id, t.text]
            list_tweets.append(tid_tweet)

        # assign each tweet's words to the content/function classes
        result = []
        for tweet in list_tweets:
            tweet_id, text = tweet[0], tweet[1]

            # doc_complete must exist even for empty tweets, so create it
            # before the branch
            doc_complete = {"con": [], "func": []}

            if len(text) > 0:
                text_split = text.split(' ')
                con = []
                func = []

                for word in text_split:
                    # tokens have the form word/TAG
                    tag = word.split('/', 1)[1]
                    if tag in Ccon:
                        con.append(word)
                    elif tag in Cfunc:
                        func.append(word)
                doc_complete["con"].append(' '.join(con))
                doc_complete["func"].append(' '.join(func))
            else:
                doc_complete["con"].append(text)
                doc_complete["func"].append(text)

            result.append([tweet_id, doc_complete])

        # insert into table penentuan kelas
        for tweet in result:
            tweet_id, text = tweet[0], tweet[1]
            content, function = ''.join(text["con"]), ''.join(text["func"])

            tb_penentuan_kelas = PenentuanKelas()
            tb_penentuan_kelas.content = content
            tb_penentuan_kelas.function = function
            tb_penentuan_kelas.tweet_id = tweet_id
            tb_penentuan_kelas.crawler_id = crawler.id
            db.session.add(tb_penentuan_kelas)
            db.session.commit()

        # LDA
        tweets_penentuan_kelas = PenentuanKelas.query.filter_by(
            crawler_id=crawler.id)

        # get tweets content
        tweets_content_tagged = []
        for tweet in tweets_penentuan_kelas:
            tweets_content_tagged.append(tweet.content)

        # strip the POS tag, keeping only the word
        documents = []
        for tweet in tweets_content_tagged:
            tweet_split = tweet.split(' ')
            temp = []
            for word in tweet_split:
                w = word.split("/", 1)[0]
                temp.append(w)
            documents.append(temp)

        # run LDA; the positional arguments appear to be the number of topics,
        # alpha, beta, and the number of iterations
        lda = LdaModel(documents, 4, 0.001, 0.001, 1000)
        result = lda.get_topic_word_pwz(tweets_content_tagged)

        # insert into table ldapwz
        for r in result:
            topic, word, pwz = r[0], r[1], r[2]

            tb_ldapwz = LdaPWZ()
            tb_ldapwz.topic = topic
            tb_ldapwz.word = word
            tb_ldapwz.pwz = pwz
            tb_ldapwz.crawler_id = crawler.id
            db.session.add(tb_ldapwz)
            db.session.commit()

        # GRAMMAR STORY
        ldapwz = LdaPWZ.query.filter_by(crawler_id=crawler.id)

        # get topic with words in dictionary
        dict_ldapwz = defaultdict(list)
        for data in ldapwz:
            dict_ldapwz[data.topic].append([data.word, data.pwz])

        # initialize the grammar-based (CFG) story generators
        cfg_informasi = CFG_Informasi()
        cfg_cerita = CFG_Cerita()

        # generate story sentences from each topic's words
        dict_story_informasi = cfg_informasi.create_sentences_from_data(
            dict(dict_ldapwz))
        dict_story_cerita = cfg_cerita.create_sentences_from_data(
            dict(dict_ldapwz))

        # join into dict_story
        dict_story = defaultdict(list)
        for d in (dict_story_informasi, dict_story_cerita):
            for key, value in d.items():
                dict_story[key].append('. '.join(i.capitalize()
                                                 for i in value))

        # insert into table GrammarStory
        for topic, stories in dict_story.items():
            # insert the 'informasi' (informational) story
            tb_grammar_story = GrammarStory()
            tb_grammar_story.topic = topic
            tb_grammar_story.rules = 'informasi'
            tb_grammar_story.story = stories[0]
            tb_grammar_story.crawler_id = crawler.id
            db.session.add(tb_grammar_story)
            db.session.commit()

            # insert the 'cerita' (narrative) story
            tb_grammar_story = GrammarStory()
            tb_grammar_story.topic = topic
            tb_grammar_story.rules = 'cerita'
            tb_grammar_story.story = stories[1]
            tb_grammar_story.crawler_id = crawler.id
            db.session.add(tb_grammar_story)
            db.session.commit()

    c = Crawler.query.order_by(Crawler.id.desc()).all()

    return render_template("stories.html",
                           crawler=c,
                           form_stweets=form_stweets)
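
Example #3 runs the whole pipeline only when form_stweets.validate_on_submit() returns True, which in Flask-WTF requires a POST request with valid form data (and a valid CSRF token when CSRF protection is enabled). Below is a minimal sketch of how such a view is typically registered; the blueprint name and URL are illustrative, not taken from the original project.

from flask import Blueprint

bp = Blueprint('stories', __name__)  # hypothetical blueprint name

@bp.route('/result', methods=['GET', 'POST'])  # POST is required for the form submission
def result():
    ...  # body as in Example #3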