示例#1
0
    def get_entities(self, extracted_text: str):
        """Parses entities from extracted text.

        Parses username (denoted by user_id), tweet as well as date from
        extracted text. The tweet is the text between the end of the username
        match and the start of the date match, or the end of the text when no
        date is found.

        Args:
            extracted_text: A string denoting extracted text from image.

        Returns:
            A tuple containing a dictionary: a mapping of user_id, tweet and date
            as well as Enum ResultStatus which gives out result status.
            For example: ::

                {
                    "user_id": "elonmusk",
                    "tweet": "Ms. Tree caught the Falcon fairing!!",
                    "date": datetime.datetime(2019, 6, 8, 7, 29, tzinfo=datetime.timezone.utc)
                }

            When no username is found every value is None and the status is
            ResultStatus.NO_RESULT. When a username is found but no date is,
            "date" is None and the status is ResultStatus.ALL_OKAY.

        Raises:
            TypeError: If extracted_text is not a string.
            ValueError: If extracted_text is empty.

        """
        if not isinstance(extracted_text, str):
            raise TypeError('Extracted text must be type string')
        if not extracted_text:
            raise ValueError('Extracted text cannot be empty')
        logger.info('Parsing data out of extracted text...')
        username_match = re.search(USERNAME_REGEX, extracted_text)
        if not username_match:
            # Same key set as the success path ('date', not 'datetime') so
            # callers can read the result uniformly.
            return ({
                'user_id': None,
                'tweet': None,
                'date': None,
            }, ResultStatus.NO_RESULT)
        datetime_match = re.search(DATETIME_REGEX, extracted_text)
        # Drop the first character of the match (presumably the '@' prefix;
        # USERNAME_REGEX is defined elsewhere -- confirm).
        user_id = username_match.group()[1:]
        tweet_start_index = username_match.end()
        # Slice ends are exclusive, so without a date the tweet runs up to
        # len(extracted_text); using len - 1 would cut off the last character.
        if datetime_match:
            tweet_end_index = datetime_match.start()
        else:
            tweet_end_index = len(extracted_text)
        tweet = extracted_text[tweet_start_index:tweet_end_index].strip()
        processed_datetime = None
        if datetime_match:
            # Hyphens are removed before handing the string to the parser.
            date_str = datetime_match.group().replace('-', '')
            # replace() stamps UTC onto the parsed value without converting it.
            processed_datetime = date_parser.parse(date_str).replace(
                tzinfo=datetime.timezone.utc)
        return ({
            'user_id': user_id,
            'tweet': tweet,
            'date': processed_datetime,
        }, ResultStatus.ALL_OKAY)
示例#2
0
    def aggregate_tweets(self, user_id: str, date: datetime.datetime):
        """Collects the tweets a user posted on a given day.

        Queries ``self._call_twitter_api`` with the username and the configured
        tweet count, then keeps the entries whose creation date formats to the
        same day as ``date`` and passes ``date_checker.valid_date``.

        Args:
            user_id: Username whose tweets are requested.
            date: The day to match tweets against.

        Returns:
            A tuple ``(tweets, status)``. ``tweets`` is a list of dicts, each
            representing a Tweet Object.

            Ref: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object.

            For example: ::

                {
                    "created_at": "Wed Oct 10 20:19:24 +0000 2018",
                    "text": "To make room for more expression, we will now count all emojis as equal..."
                }

            ``status`` is ResultStatus.ALL_OKAY when at least one tweet matched
            and ResultStatus.NO_RESULT when none did (``tweets`` is then an
            empty list). If the API call raises, the result is
            ``(None, ResultStatus.MODULE_FAILURE)``; if the API reports a
            non-OK status, the result is ``(None, <that status>)``.

        Raises:
            TypeError: If user_id is not a str or date is not a datetime.
            ValueError: If user_id or date is falsy.

        """
        if not (isinstance(user_id, str)
                and isinstance(date, datetime.datetime)):
            raise TypeError(
                'User ID must be type string and date must be type datetime.datetime'
            )
        if not (user_id and date):
            raise ValueError('User ID or Date cannot be empty')
        logger.info('Searching for tweet using Twitter API...')
        query = {
            app_config.TWEET_USERNAME_KEY: user_id,
            app_config.TWEET_COUNT_KEY: app_config.TWEET_COUNT,
        }
        try:
            response, api_status = self._call_twitter_api(query)
            if api_status != ResultStatus.ALL_OKAY:
                return (None, api_status)
        except Exception as err:
            logger.exception(err)
            return (None, ResultStatus.MODULE_FAILURE)
        matched = []
        for item in response:
            posted_at = date_parser.parse(item[app_config.TWEET_DATE_KEY])
            on_target_day = (date_checker.format_for_date(posted_at) ==
                             date_checker.format_for_date(date))
            # valid_date is only consulted for tweets that fall on the day.
            if not (on_target_day and date_checker.valid_date(posted_at)):
                continue
            logger.debug('Tweet found...: ' +
                         str(item[app_config.TWEET_TEXT_KEY]))
            matched.append(item)
        outcome = ResultStatus.ALL_OKAY if matched else ResultStatus.NO_RESULT
        return (matched, outcome)
示例#3
0
 def rescale(file_path):
     """Produces a rescaled, thresholded grayscale PNG copy of an image.

     Runs the external ``convert`` command (presumably ImageMagick -- confirm)
     on ``file_path`` and writes the result to a new uuid-named ``.png`` file
     inside ``app_config.FILE_DIRECTORY``.

     Args:
         file_path: Path of the image to process.

     Returns:
         The path of the newly written file.

     Raises:
         TypeError: If file_path is not a string.
         ValueError: If file_path is empty.
         subprocess.CalledProcessError: If ``convert`` exits with a non-zero
             status.
     """
     if not isinstance(file_path, str):
         raise TypeError('File path must be type string')
     if not file_path:
         raise ValueError('File path cannot be empty')
     logger.info('Rescaling Image to 300 dpi...')
     output_name = str(uuid.uuid1()) + '.png'
     new_file_path = os.path.join(app_config.FILE_DIRECTORY, output_name)
     # Options are passed to `convert` in exactly this order.
     options = [
         '-resample', '300x300',
         '-alpha', 'off',
         '-colorspace', 'Gray',
         '-threshold', '75%',
         '-density', '300x300',
         '-units', 'PixelsPerCentimeter',
         '-blur', '1x65000',
         '-level', '50x100%',
     ]
     command = ['convert', file_path] + options + [new_file_path]
     # check=True raises CalledProcessError on a non-zero exit status.
     subprocess.run(command, check=True)
     return new_file_path
示例#4
0
def save_from_url(image_url: str):
    """Saves image given via url to disk.

    Downloads ``image_url`` as a stream and writes the body to
    ``app_config.FILE_DIRECTORY`` under the last path segment of the URL.

    Args:
        image_url: URL of the image to download.

    Returns:
        The path the image was saved to, or None when the last segment of the
        URL is empty or is rejected by ``allowed_file``.

    Raises:
        TypeError: If image_url is not a string.
        ValueError: If image_url is empty.
        FileNotFoundError: If the server answers with a status other than 200.
    """
    if not isinstance(image_url, str):
        raise TypeError('image_url must be a string')
    if not image_url:
        raise ValueError('image_url has to be a valid string')
    # With stream=True the connection stays checked out until the body is
    # fully read or the response is closed. The context manager closes it on
    # every exit path, including the non-200 raise and the early return.
    with requests.get(image_url, stream=True,
                      allow_redirects=True) as response:
        if response.status_code != 200:
            raise FileNotFoundError()
        filename = image_url.split("/")[-1]
        if not (filename and allowed_file(filename)):
            return None
        saved_file_path = os.path.join(app_config.FILE_DIRECTORY, filename)
        logger.info('Saving file to path: ' + saved_file_path)
        # Let urllib3 undo any Content-Encoding (e.g. gzip) while copying.
        response.raw.decode_content = True
        with open(saved_file_path, 'wb') as f:
            shutil.copyfileobj(response.raw, f)
        logger.info('Image successfully downloaded')
        return saved_file_path
示例#5
0
    def get_similarity(self, extracted_tweet: str, same_day_tweets: list):
        """Computes a similarity matrix for a tweet and its candidates.

        Builds a corpus with the extracted tweet first, followed by the
        same-day tweets, vectorises it with the module-level
        ``count_vectorizer`` and passes the result to ``cosine_similarity``.

        Args:
            extracted_tweet: A string denoting extracted tweet from image.
            same_day_tweets: A list containing tweets of the target date.

        Returns:
            A tuple containing a similarity matrix, which is a numpy array,
            as well as Enum ResultStatus which gives out result status.
            For example: ::

                ([[1.        0.9258201]
                 [0.9258201 1.       ]], ResultStatus.ALL_OKAY)

            If vectorising or the similarity computation raises, the result
            is ``(None, ResultStatus.MODULE_FAILURE)``.

        Raises:
            TypeError: If extracted_tweet is not a str or same_day_tweets is
                not a list.
            ValueError: If extracted_tweet or same_day_tweets is empty.

        """
        if not (isinstance(extracted_tweet, str)
                and isinstance(same_day_tweets, list)):
            raise TypeError(
                'Extracted tweet must be type str and Same day tweets must be type list'
            )
        if not (extracted_tweet and same_day_tweets):
            raise ValueError(
                'Extracted tweet must be a valid string and same day tweets must be a valid list'
            )
        logger.info('Processing similarity of two tweets...')
        # The extracted tweet goes first in the corpus.
        corpus = [extracted_tweet, *same_day_tweets]
        logger.debug('Corpus: ' + str(corpus))
        try:
            vectors = count_vectorizer.fit_transform(corpus)
            similarity_matrix = cosine_similarity(vectors, vectors)
        except Exception as err:
            logger.exception(err)
            return (None, ResultStatus.MODULE_FAILURE)
        logger.debug('Similartiy Matrix: ' + str(similarity_matrix))
        return (similarity_matrix, ResultStatus.ALL_OKAY)
示例#6
0
 def get_text(self, file_path: str):
     """Extracts text from image.

     Rescales the image with ``self.rescale``, runs
     ``pytesseract.image_to_string`` on the rescaled copy and removes that
     copy afterwards, whether or not text extraction succeeded.

     Args:
         file_path: Path of the image to read text from.

     Returns:
         A tuple ``(text, status)``: the extracted text with
         ResultStatus.ALL_OKAY, ``(None, ResultStatus.NO_RESULT)`` when no
         text was extracted, or ``(None, ResultStatus.MODULE_FAILURE)`` when
         rescaling, opening the image or text extraction raised.

     Raises:
         TypeError: If file_path is not a string.
         ValueError: If file_path is empty.
     """
     if not isinstance(file_path, str):
         raise TypeError('File path must be type string')
     if not file_path:
         raise ValueError('File path cannot be empty')
     logger.info('Processing Image...')
     try:
         new_file_path = self.rescale(file_path)
         try:
             logger.info('Extracting text from rescaled image...')
             # Close the image handle before the file is deleted below.
             with PIL.Image.open(new_file_path) as img:
                 text = pytesseract.image_to_string(image=img)
         finally:
             # Best-effort cleanup of the rescaled copy; a failure to delete
             # is logged but must not change the result.
             try:
                 os.remove(new_file_path)
             except OSError as e:
                 logger.exception(e)
     except Exception as e:
         logger.exception(e)
         return (None, ResultStatus.MODULE_FAILURE)
     if not text:
         return (None, ResultStatus.NO_RESULT)
     return (text, ResultStatus.ALL_OKAY)