Example #1
def detect_gender(tweet):
    """
    Detect the gender of the tweet's author.
    :param tweet:   the tweet object
    :return:        nothing; the tweet object is updated in place
    """
    orig_name = tweet['user']['name']
    name = re.sub(r'[^\x00-\x7f]', r'',
                  orig_name)  # remove non-ASCII characters
    gender = 'NA'
    first_name = ''  # pre-set so the log call below cannot raise NameError

    try:
        first_name = extract_first_name(name)

        global gp
        gender = gp.classify(first_name)
    except IndexError:
        # the first name is most likely empty here, probably because
        # the original name contained only non-standard characters
        pass
    except Exception:
        log.exception("Unable to detect gender based on first name")

    tweet['user']['gender'] = gender
    log.debug(
        f"Tweet[{tweet['id']}] Name: {orig_name} - First name: {first_name} - Gender: {gender}"
    )
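
For context, detect_gender assumes a module-level classifier gp exposing a classify(first_name) method and a helper extract_first_name. A minimal sketch of how the function might be exercised, with a hypothetical stub standing in for the real classifier (every name below other than detect_gender is illustrative):

import logging
import re

log = logging.getLogger(__name__)

class StubGenderClassifier:
    """Hypothetical stand-in for the real gp model."""
    def classify(self, first_name):
        return 'male' if first_name.lower() in {'john', 'bob'} else 'female'

def extract_first_name(name):
    # assumption: the first whitespace-separated token is the first name
    return name.split()[0]

gp = StubGenderClassifier()

tweet = {'id': 1, 'user': {'name': 'John Doe'}}
detect_gender(tweet)
print(tweet['user']['gender'])  # -> 'male' with the stub above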
Example #2
    def on_exception(self, exception):
        log.exception(f"Exception from Twitter streaming: {exception}")
        stream = self._stream()
        if stream and not stream.running:
            # TODO: fix this issue properly
            time.sleep(2)
            stream.start()
Example #3
    def run(self):
        interval = 60   # seconds
        while not self.is_stopped():
            try:
                free = _get_percent_free_space()
                log.info(f"Disk usage: {free}% free")

                threshold = config.DISK_FREE_THRESHOLD
                if free < threshold:
                    log.warning(f"Disk space left {free:.2f}% is below threshold of {threshold:.2f}%")
                    clean_old_data(days=7)

                    free = _get_percent_free_space()
                    while free < threshold:
                        log.warning(f"Disk space left {free:.2f}% is below threshold of {threshold:.2f}%")
                        force_clean_old_tweets(count=1000)
                        free = _get_percent_free_space()

                    log.info(f"Finished cleaning disk space [{free}% free]")
            except Exception:
                log.exception("Error during disk usage monitoring")
            finally:
                if self.is_stopped():
                    return
                self.sleep(interval)
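
Examples #3, #5 and #11 call is_stopped(), stop() and an interruptible sleep() that are not defined in these snippets. A plausible base class built on threading.Event (a sketch of the assumed interface, not the project's actual class):

import threading

class StoppableWorker(threading.Thread):
    """Hypothetical base providing the is_stopped()/sleep()/stop() calls above."""

    def __init__(self):
        super().__init__()
        self._stop_event = threading.Event()

    def stop(self):
        self._stop_event.set()

    def is_stopped(self):
        return self._stop_event.is_set()

    def sleep(self, timeout=None):
        # Event.wait() returns as soon as stop() is called, so the worker
        # wakes up immediately instead of sleeping out the full interval
        self._stop_event.wait(timeout)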
Example #4
def start_worker():
    q = connect_to_message_queue()
    log.info("Started listening for messages")
    while True:
        try:
            tweet = q.pop()
            process(tweet)
        except Exception:
            log.exception("Error during execution")
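
connect_to_message_queue() and q.pop() are project helpers whose implementation is not shown; the loop reads as if pop() blocks until a message arrives. A purely illustrative Redis-backed wrapper with those semantics (the real transport may differ):

import json

import redis

class MessageQueue:
    """Illustrative blocking queue on top of a Redis list."""

    def __init__(self, name='tweets', url='redis://localhost:6379/0'):
        self._redis = redis.Redis.from_url(url)
        self._name = name

    def push(self, item):
        self._redis.rpush(self._name, json.dumps(item))

    def pop(self):
        # blpop blocks until an item is available, then returns (key, value)
        _, raw = self._redis.blpop(self._name)
        return json.loads(raw)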
Example #5
    def run(self):
        # start watching for changes in a different thread
        from threading import Thread
        worker = Thread(target=self._watch)
        worker.start()

        # block until the worker is asked to stop
        self.sleep()

        # stop the MongoDB change stream and cleanup
        try:
            if self._cursor:
                self._cursor.close()
            worker.join()
        except Exception:
            log.exception("Error while stopping the thread")
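
Note that closing the change-stream cursor here is what unblocks the next(self._cursor) call in the watcher thread (see Example #11), which lets worker.join() return promptly instead of waiting for another change event.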
Example #6
def tweets():
    """
    Return processed tweets from the database, newest first.

    :param limit:   query parameter limiting the number of results returned
    """
    try:
        limit = int(request.args.get('limit', 0))

        result = db.result.find()\
            .sort('$natural', pymongo.DESCENDING)\
            .limit(limit)
        # round-trip through bson.json_util so BSON-specific types
        # (ObjectId, datetimes) become JSON-serializable
        return jsonify(result=[
            json.loads(json.dumps(item, indent=4, default=json_util.default))
            for item in result
        ])
    except Exception:
        log.exception("Unable to retrieve tweets")
        return Response("Unable to retrieve tweets", status=500)
Example #7
def search_tweets():
    try:
        req = request.get_json(force=True)
        keyword = req.get('keyword')
        limit = int(req.get('limit', 1000))

        if not keyword:
            return Response("Keyword is not specified", 400)

        api = _get_api()
        result = (process(tweet._json) for tweet in tweepy.Cursor(
            api.search, q=keyword, count=limit).items(limit))
        return jsonify(result=[
            json.loads(json.dumps(item, indent=4, default=json_util.default))
            for item in result
        ])
    except Exception:
        log.exception("Unable to search tweets")
        return Response("Unable to search tweets", status=500)
Example #8
    def on_data(self, raw_data):
        try:
            data = json.loads(raw_data)

            # The data is not always a tweet but can also be a message from Twitter system itself
            if 'limit' in data:
                return self.on_limit(data)
            elif 'text' not in data:
                log.warning(f"Unknown message type: {data}")
                # TODO: what to do with unknown message?

                db.unknown.insert(data)
                return

            if config.OPERATION_MODE == 'normal':
                process(data)
            elif config.OPERATION_MODE == 'mq':
                QUEUE.push(data)
        except Exception:
            log.exception("Exception while processing tweet")
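
The 'limit' branch above handles Twitter's streaming rate-limit notices, which arrive on the stream in place of tweets; such a message looks roughly like {"limit": {"track": 42}}, where the number counts tweets withheld from the stream. Anything that is neither a limit notice nor tweet text is stored for later inspection.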
Example #9
def filter_tweets_by_date():
    """
    Retrieve tweets by start and end dates.
    """
    try:
        req = request.get_json(force=True)
        start_date = req.get('start_date')
        end_date = req.get('end_date')

        try:
            # ensure that the specified time is in UTC +0000
            date_format = '%d/%m/%Y %z'
            start_date = datetime.strptime(start_date + " +0000",
                                           date_format).timestamp() * 1000
            end_date = datetime.strptime(end_date + " +0000",
                                         date_format).timestamp() * 1000

            if start_date >= end_date:
                raise ValueError(
                    "End date is equal to or earlier than start date")
        except Exception:
            log.exception("Date(s) are invalid")
            return Response("Date(s) are invalid", status=400)

        result = db.result.find(
            {'timestamp_ms': {
                '$gte': start_date,
                '$lt': end_date
            }})

        return jsonify(result=[
            json.loads(json.dumps(item, indent=4, default=json_util.default))
            for item in result
        ])
    except Exception:
        log.exception("Unable to filter tweets")
        return Response("Unable to filter tweets", status=500)
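
As a quick sanity check of the conversion above: the format '%d/%m/%Y %z' parses a dd/mm/yyyy string as midnight in the given offset, and multiplying the POSIX timestamp by 1000 yields the epoch milliseconds stored in timestamp_ms:

from datetime import datetime

ms = datetime.strptime('01/02/2020 +0000', '%d/%m/%Y %z').timestamp() * 1000
print(int(ms))  # 1580515200000, i.e. 2020-02-01 00:00:00 UTC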
Example #10
def init():
    """
    Initialize the sentiment analyser.
    This method must be called before any usage of this module.
    """
    try:
        log.info("Loading pre-trained classifier for sentiment analysis")
        load_classifier()
    except FileNotFoundError:
        log.info("Pre-trained classifier not found. Will train a new one")
        train_and_test_classifier()
        try:
            load_classifier()
        except Exception:
            log.exception(
                "Unable to load the newly trained classifier. Sentiment analysis will not work"
            )
            return
    except Exception:
        log.exception(
            "Unable to load the classifier. Sentiment analysis will not work")
        return

    log.info("Loaded pre-trained classifier for sentiment analysis")
Example #11
    def _watch(self):
        while not self.is_stopped():
            log.debug(
                f"Start watching for changes on {self._collection.name}")
            try:
                self._cursor = self._collection.watch(self._pipeline)
                while not self.is_stopped():
                    try:
                        doc = next(self._cursor)
                        if doc and callable(self._on_change):
                            self._on_change()
                    except StopIteration:
                        break
            except pymongo.errors.OperationFailure:
                log.exception("Operation failed. Updates will not be watched")
                self.stop()
                break
            except pymongo.errors.PyMongoError:
                log.exception("Error while watching for updates")
            except Exception:
                log.exception("Unexpected error while watching for updates")
Example #12
    def generate():
        import io, csv
        output = io.StringIO()
        writer = csv.DictWriter(output,
                                dialect='unix',
                                fieldnames=[
                                    "id",
                                    "timestamp",
                                    "text",
                                    "hashtags",
                                    "reply_to",
                                    "mentions",
                                    "keywords",
                                    "gender",
                                    "longitude",
                                    "latitude",
                                    "city",
                                    "state",
                                    "country_code",
                                    "sentiment",
                                ])

        writer.writeheader()
        output.seek(0)
        yield output.getvalue()
        output.truncate(0)

        # helper functions to retrieve coordinates
        get_long = lambda tweet: safe_get(
            safe_get_dict(tweet, ['coordinates', 'coordinates'], default=[]),
            0, '')
        get_lat = lambda tweet: safe_get(
            safe_get_dict(tweet, ['coordinates', 'coordinates'], default=[]),
            1, '')

        # the number of results to return
        limit = int(request.args.get('limit', 0))

        for tweet in db.result.find()\
                .sort('$natural', pymongo.DESCENDING)\
                .limit(limit):
            try:
                writer.writerow({
                    'id': tweet['id'],
                    'timestamp': tweet['timestamp'],
                    'text': tweet['text'],
                    'hashtags': ','.join(tweet['entities'].get('hashtags', [])),
                    'reply_to': tweet.get('reply_to', ''),
                    'mentions': ','.join(
                        i['screen_name']
                        for i in tweet['entities']['user_mentions']),
                    'keywords': ','.join(tweet.get('keywords') or []),
                    'gender': tweet.get('gender', ''),
                    'longitude': get_long(tweet),
                    'latitude': get_lat(tweet),
                    'city': safe_get_dict(tweet, ['place', 'city'], ''),
                    'state': safe_get_dict(tweet, ['place', 'state'], ''),
                    'country_code': safe_get_dict(tweet,
                                                  ['place', 'country_code'], ''),
                    'sentiment': tweet.get('sentiment', ''),
                })

                yield output.getvalue()
            except Exception:
                log.exception(f"Error during exporting tweet [{tweet['id']}]")
            finally:
                output.seek(0)
                output.truncate(0)
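
safe_get and safe_get_dict are project helpers not shown in these examples; from the call sites they appear to be defensive index and nested-key lookups. A plausible sketch matching how they are used above (an assumption, not the project's code):

def safe_get(seq, index, default=None):
    """Return seq[index], or default when the index does not exist."""
    try:
        return seq[index]
    except (IndexError, TypeError):
        return default

def safe_get_dict(d, keys, default=None):
    """Walk nested dicts along keys; return default on any missing level."""
    for key in keys:
        if not isinstance(d, dict) or key not in d:
            return default
        d = d[key]
    return d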
Example #13
def export_tweets():
    """
    Export results as a CSV file, newest tweet first.

    :param limit:   query parameter limiting the number of results returned
    """
    @copy_current_request_context
    def generate():
        import io, csv
        output = io.StringIO()
        writer = csv.DictWriter(output,
                                dialect='unix',
                                fieldnames=[
                                    "id",
                                    "timestamp",
                                    "text",
                                    "hashtags",
                                    "reply_to",
                                    "mentions",
                                    "keywords",
                                    "gender",
                                    "longitude",
                                    "latitude",
                                    "city",
                                    "state",
                                    "country_code",
                                    "sentiment",
                                ])

        writer.writeheader()
        output.seek(0)
        yield output.getvalue()
        output.truncate(0)

        # helper functions to retrieve coordinates
        get_long = lambda tweet: safe_get(
            safe_get_dict(tweet, ['coordinates', 'coordinates'], default=[]),
            0, '')
        get_lat = lambda tweet: safe_get(
            safe_get_dict(tweet, ['coordinates', 'coordinates'], default=[]),
            1, '')

        # the number of results to return
        limit = int(request.args.get('limit', 0))

        for tweet in db.result.find()\
                .sort('$natural', pymongo.DESCENDING)\
                .limit(limit):
            try:
                writer.writerow({
                    'id': tweet['id'],
                    'timestamp': tweet['timestamp'],
                    'text': tweet['text'],
                    'hashtags': ','.join(tweet['entities'].get('hashtags', [])),
                    'reply_to': tweet.get('reply_to', ''),
                    'mentions': ','.join(
                        i['screen_name']
                        for i in tweet['entities']['user_mentions']),
                    'keywords': ','.join(tweet.get('keywords') or []),
                    'gender': tweet.get('gender', ''),
                    'longitude': get_long(tweet),
                    'latitude': get_lat(tweet),
                    'city': safe_get_dict(tweet, ['place', 'city'], ''),
                    'state': safe_get_dict(tweet, ['place', 'state'], ''),
                    'country_code': safe_get_dict(tweet,
                                                  ['place', 'country_code'], ''),
                    'sentiment': tweet.get('sentiment', ''),
                })

                yield output.getvalue()
            except Exception:
                log.exception(f"Error during exporting tweet [{tweet['id']}]")
            finally:
                output.seek(0)
                output.truncate(0)

    try:
        response = Response(stream_with_context(generate()),
                            mimetype='text/csv')
        response.headers[
            'Content-Disposition'] = 'attachment; filename=result.csv'
        return response
    except Exception:
        log.exception("Unable to export tweets")
        return Response("Unable to export tweets", status=500)
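
Because the CSV is generated lazily and wrapped in stream_with_context, rows are flushed to the client as they are read from MongoDB rather than buffered in memory. Assuming the route is mounted at /tweets/export (the actual path is not shown in this snippet), the endpoint could be exercised with:

curl -o result.csv "http://localhost:5000/tweets/export?limit=100"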
Example #14
def process(tweet):
    """
    Perform processing of tweet data.

    :param tweet:   the tweet object
    :return:        a new tweet object
    """
    try:
        log.debug(f"Processing tweet[{tweet['id']}]")
        # extract the full text from the tweet
        if tweet.get('truncated') and tweet.get('extended_tweet'):
            full_text = tweet['extended_tweet']['full_text']
            tweet['text'] = full_text

        # tweets obtained via Search API do not have `timestamp_ms` field by default
        # we need to derive it from `created_at`
        if not tweet.get('timestamp_ms'):
            datetime_format = '%a %b %d %H:%M:%S %z %Y'
            tweet['timestamp_ms'] = datetime.strptime(
                tweet['created_at'], datetime_format).timestamp() * 1000

        hash_tags = [i['text'] for i in tweet['entities'].get('hashtags', [])]
        user_mentions = [{
            'screen_name': i['screen_name'],
            'name': i['name'],
            'id': i['id_str'],
        } for i in tweet['entities'].get('user_mentions', [])]

        # more complex processing steps
        gender.detect_gender(tweet)
        location.detect_location(tweet)
        sentiment.classify(tweet)

        t = {
            'id': tweet['id_str'],
            'user': {
                'id': tweet['user']['id_str'],
                'name': tweet['user']['name'],
                'screen_name': tweet['user']['screen_name'],
                'gender': tweet['user']['gender'],
            },
            'timestamp': tweet['created_at'],
            'timestamp_ms': int(tweet['timestamp_ms']),
            'text': tweet['text'],
            'coordinates': tweet.get('coordinates'),
            'place': {
                'city': safe_get_dict(tweet, ['place', 'city']),
                'state': safe_get_dict(tweet, ['place', 'state']),
                'country_code': safe_get_dict(tweet,
                                              ['place', 'country_code']),
            },
            'keywords': [],  # TODO
            'reply_to': tweet.get('in_reply_to_screen_name'),
            'entities': {
                'hashtags': hash_tags,
                'user_mentions': user_mentions,
            },
            'sentiment': tweet.get('sentiment')
        }
        db.result.insert(t)
        # return the processed tweet so callers such as search_tweets()
        # (Example #7) receive the result rather than None
        return t
    except pymongo.errors.DuplicateKeyError:
        log.warning(f"Tweet [{tweet['id']}] already exists in database")
    except Exception:
        log.exception(f"Error during processing tweet [{tweet['id']}]")
Example #15
import pymongo
from datetime import datetime
from flask_pymongo import PyMongo
from hatespeech.api.app import app
from hatespeech.api.logging2 import log
from hatespeech.config import config

try:
    mongo = PyMongo(app)
    db = None
    with app.app_context():
        log.info("Establishing database connection")
        db = mongo.db
        log.info(f"Connected to database: {db.client.server_info()}")
except Exception:
    log.exception("Unable to establish database connection")


@app.route('/db/recreate')
def recreate_db():
    """
    Recreate the database.
    """
    from script import script

    # table for storing categories of hate words
    db.category.drop()
    db.category.create_index([('name', pymongo.ASCENDING)], unique=True)

    # table for storing hate words