def clean_text(self, extracted_text: str):
    """Removes stop words and samples words out of the tweet to create a snippet.

    Attributes:
        extracted_text: A string denoting extracted text from the image.

    Returns:
        A tuple containing a tweet snippet as well as an Enum ResultStatus
        which gives out the result status.
    """
    if not isinstance(extracted_text, str):
        raise TypeError('Extracted text must be type string')
    if not extracted_text:
        raise ValueError('Extracted text cannot be empty')
    try:
        # Strip punctuation before tokenizing.
        non_punc_tweet = extracted_text.translate(
            str.maketrans('', '', string.punctuation))
        word_tokens = nltk.tokenize.word_tokenize(non_punc_tweet)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    # Drop stop words, then sample the 3rd through 6th remaining words
    # as the search snippet.
    filtered_sentence = [w for w in word_tokens if w not in stopwords]
    picked_words = filtered_sentence[2:min(len(filtered_sentence), 6)]
    tweet_snippet = " ".join(picked_words)
    if not tweet_snippet:
        return (tweet_snippet, ResultStatus.NO_RESULT)
    logger.debug(f'Tweet Snippet: {tweet_snippet}')
    return (tweet_snippet, ResultStatus.ALL_OKAY)
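# Usage sketch for clean_text (illustrative only; the instance name and the
# sample input are assumptions, not taken from the project):
#
#   parser = text_service.DataParser()
#   snippet, status = parser.clean_text(
#       'Text that Tesseract pulled out of a screenshot of a tweet')
#   if status == ResultStatus.ALL_OKAY:
#       print(snippet)  # a short phrase of non-stop-words sampled from the text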
def exec(self, file_path):
    """Executes controller flow.

    Controller uses the image service to extract text from the image, passes
    the text to the text service to parse entities such as username, tweet
    and date, then uses the search service to retrieve the tweet if available.

    Attributes:
        file_path: A string denoting the path to the image file.

    Returns:
        search_results: The matching tweet object, if any.
        status: Enum ResultStatus representing result status.
    """
    if not isinstance(file_path, str):
        raise TypeError('File path must be type str')
    if not file_path:
        raise ValueError('File path must be a valid string')
    entities, preprocess_status = common.extract_and_parse(file_path)
    if preprocess_status != ResultStatus.ALL_OKAY:
        return (None, ResultStatus.MODULE_FAILURE)
    try:
        text_processor = text_service.DataParser()
        tweet_snippet, text_processor_status = text_processor.clean_text(
            entities['tweet'])
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    if text_processor_status != ResultStatus.ALL_OKAY:
        return (None, text_processor_status)
    try:
        search_controller = search_service.TwintSearch()
        search_results, search_status = search_controller.search(
            entities['user_id'], tweet_snippet, entities['date'])
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    if search_status != ResultStatus.ALL_OKAY:
        return (None, search_status)
    # Collect the raw tweet text for similarity scoring.
    tweet_text_list = [tweet_obj.tweet for tweet_obj in search_results]
    validity, match_index, validator_status = common.calculate_and_validate(
        entities=entities, tweet_text_list=tweet_text_list)
    if validator_status == ResultStatus.MODULE_FAILURE:
        return (None, ResultStatus.MODULE_FAILURE)
    if not validity:
        return (None, ResultStatus.NO_RESULT)
    return (search_results[match_index], ResultStatus.ALL_OKAY)
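# End-to-end usage sketch for this controller flow (hypothetical; `controller`
# stands for an instance of whichever controller class defines this exec(),
# and the file path is made up):
#
#   tweet, status = controller.exec('/tmp/screenshot.png')
#   if status == ResultStatus.ALL_OKAY:
#       print(tweet.tweet)  # .tweet holds the text on a Twint tweet object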
def aggregate_tweets(self, user_id: str, date: datetime.datetime):
    """Aggregates tweets from a single day.

    Retrieves tweets pertaining to the given username and date using the
    Twitter Search API and aggregates the matching tweets into a list.

    Returns:
        A tuple containing a list of dicts representing Tweet objects as well
        as an Enum ResultStatus. Ref:
        https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object.

        For example: ::

            {
                "created_at": "Wed Oct 10 20:19:24 +0000 2018",
                "text": "To make room for more expression, we will now count all emojis as equal—including those with gender and skin t… https://t.co/MkGjXf9aXm"
            }
    """
    if not isinstance(user_id, str) or not isinstance(
            date, datetime.datetime):
        raise TypeError(
            'User ID must be type string and date must be type datetime.datetime'
        )
    if not user_id or not date:
        raise ValueError('User ID or Date cannot be empty')
    logger.info('Searching for tweet using Twitter API...')
    querystring = {
        app_config.TWEET_USERNAME_KEY: user_id,
        app_config.TWEET_COUNT_KEY: app_config.TWEET_COUNT
    }
    try:
        response, response_status = self._call_twitter_api(querystring)
        if response_status != ResultStatus.ALL_OKAY:
            return (None, response_status)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    # Keep only the tweets posted on the same calendar day as the target date.
    same_day_tweets = list()
    for entry in response:
        tweet_date = date_parser.parse(entry[app_config.TWEET_DATE_KEY])
        if date_checker.format_for_date(
                tweet_date) == date_checker.format_for_date(
                    date) and date_checker.valid_date(tweet_date):
            logger.debug('Tweet found...: ' +
                         str(entry[app_config.TWEET_TEXT_KEY]))
            same_day_tweets.append(entry)
    if not same_day_tweets:
        return (same_day_tweets, ResultStatus.NO_RESULT)
    return (same_day_tweets, ResultStatus.ALL_OKAY)
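# Usage sketch for aggregate_tweets (illustrative; the instance name, username
# and date are assumptions):
#
#   searcher = search_service.TwitterAPISearch()
#   tweets, status = searcher.aggregate_tweets(
#       'Twitter', datetime.datetime(2018, 10, 10))
#   if status == ResultStatus.ALL_OKAY:
#       print(len(tweets), 'tweets found on that day')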
def get_similarity(self, extracted_tweet: str, same_day_tweets: list):
    """Calculates a similarity matrix.

    Calculates a similarity matrix of the corpus containing the extracted
    tweet and the tweets aggregated from the Twitter Search API using the
    cosine similarity approach.

    Attributes:
        extracted_tweet: A string denoting the extracted tweet from the image.
        same_day_tweets: A list containing tweets of the target date.

    Returns:
        A tuple containing a similarity matrix, which is a numpy array, as
        well as an Enum ResultStatus which gives out the result status.

        For example: ::

            ([[1.        0.9258201]
              [0.9258201 1.       ]], ResultStatus.ALL_OKAY)
    """
    if not isinstance(extracted_tweet, str) or not isinstance(
            same_day_tweets, list):
        raise TypeError(
            'Extracted tweet must be type str and same day tweets must be type list'
        )
    if not extracted_tweet or not same_day_tweets:
        raise ValueError(
            'Extracted tweet must be a valid string and same day tweets must be a valid list'
        )
    logger.info('Processing similarity of two tweets...')
    # Build the corpus with the extracted tweet at index 0, followed by the
    # aggregated same-day tweets.
    corpus = [extracted_tweet]
    corpus.extend(same_day_tweets)
    logger.debug('Corpus: ' + str(corpus))
    try:
        sparse_matrix = count_vectorizer.fit_transform(corpus)
        similarity_matrix = cosine_similarity(sparse_matrix, sparse_matrix)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    logger.debug('Similarity Matrix: ' + str(similarity_matrix))
    return (similarity_matrix, ResultStatus.ALL_OKAY)
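# Usage sketch for get_similarity (illustrative inputs; `processor` stands for
# a text_service.TextProcessor instance, as instantiated in
# calculate_and_validate below):
#
#   matrix, status = processor.get_similarity(
#       'count all emojis as equal',
#       ['we will now count all emojis as equal', 'unrelated announcement'])
#   if status == ResultStatus.ALL_OKAY:
#       # Row 0 holds the cosine similarity of the extracted tweet against
#       # itself (column 0) and against each candidate tweet (columns 1+).
#       closest = matrix[0][1:].argmax()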
def search(self,
           user_id: str,
           tweet_snippet: str,
           date: datetime.datetime = None):
    """Searches for tweets.

    Retrieves tweets for the given username, date and tweet snippet using
    Twint, and aggregates them into a list.

    Returns:
        A tuple containing a list of results, where each result represents a
        tweet object, as well as an Enum ResultStatus.

        For example: ::

            ([<tweet_obj>], ResultStatus.ALL_OKAY)
    """
    if not isinstance(user_id, str) or not isinstance(tweet_snippet, str):
        raise TypeError(
            'User ID and tweet_snippet must be type string, date must be type datetime.datetime'
        )
    if not user_id or not tweet_snippet:
        raise ValueError('User ID, Tweet or Date cannot be empty')
    results = list()
    twint_config = twint.Config()
    twint_config.Username = user_id
    if date:
        # Constrain the search to a two-day window starting at the target date.
        twint_config.Since = date_checker.format_for_date(date)
        twint_config.Until = date_checker.format_for_date(
            date + datetime.timedelta(days=2))
    else:
        twint_config.Search = tweet_snippet
    twint_config.Limit = app_config.TWEET_MAX_STORE
    twint_config.Store_object = True
    twint_config.Store_object_tweets_list = results
    try:
        twint.run.Search(twint_config)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    if not results:
        return (results, ResultStatus.NO_RESULT)
    logger.debug(f'Search results: {results}\n')
    return (results, ResultStatus.ALL_OKAY)
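# Usage sketch for search (illustrative; username, snippet and date are
# assumptions):
#
#   searcher = search_service.TwintSearch()
#   results, status = searcher.search(
#       'Twitter', 'count all emojis as equal',
#       date=datetime.datetime(2018, 10, 10))
#   if status == ResultStatus.ALL_OKAY:
#       for tweet_obj in results:
#           print(tweet_obj.tweet)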
def exec(self, file_path: str):
    """Executes controller flow.

    Controller uses the image service to extract text from the image, passes
    the text to the text service to parse entities such as username, tweet
    and date, uses the search service to retrieve same-day tweets, the text
    service to find a similar tweet, and finally verifies the tweet.

    Attributes:
        file_path: A string denoting the path to the image file.

    Returns:
        valid_tweet: A tweet object.
        status: Enum ResultStatus representing result status.
    """
    if not isinstance(file_path, str):
        raise TypeError('File path must be type str')
    if not file_path:
        raise ValueError('File path must be a valid string')
    entities, preprocess_status = common.extract_and_parse(file_path)
    if preprocess_status != ResultStatus.ALL_OKAY:
        return (None, ResultStatus.MODULE_FAILURE)
    try:
        search_controller = search_service.TwitterAPISearch()
        same_day_tweets, search_status = search_controller.aggregate_tweets(
            entities['user_id'], entities['date'])
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    if search_status != ResultStatus.ALL_OKAY:
        return (None, search_status)
    validity, match_index, validator_status = common.calculate_and_validate(
        entities=entities, tweet_text_list=same_day_tweets)
    if validator_status != ResultStatus.ALL_OKAY:
        return (None, ResultStatus.MODULE_FAILURE)
    return (same_day_tweets[match_index], ResultStatus.ALL_OKAY)
def calculate_and_validate(entities: dict, tweet_text_list: list):
    """Calculates a similarity matrix and validates the tweet.

    Calculates a similarity matrix from the same-day tweet corpus using the
    text service and validates the tweet using the validator.

    Args:
        entities: A dictionary of entities extracted from text.
        tweet_text_list: A list of strings representing same-day tweets.

    Returns:
        valid_tweet: Validity status of the tweet.
        match_index: Index of the matching tweet in tweet_text_list.
        status: Enum ResultStatus representing result status.
    """
    try:
        text_processor = text_service.TextProcessor()
        similarity_matrix, processor_status = text_processor.get_similarity(
            entities['tweet'], tweet_text_list)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    if processor_status != ResultStatus.ALL_OKAY:
        return (None, None, processor_status)
    try:
        valid_tweet, match_index, validator_status = validator.verify_validity(
            similarity_matrix)
    except Exception as e:
        logger.exception(e)
        return (None, None, ResultStatus.MODULE_FAILURE)
    if validator_status == ResultStatus.MODULE_FAILURE:
        return (None, None, validator_status)
    logger.debug('Tweet Validity: ' + str(valid_tweet))
    if not valid_tweet:
        return (False, None, ResultStatus.NO_RESULT)
    # The corpus passed to get_similarity has the extracted tweet at index 0,
    # so subtract one to map the matrix index back into tweet_text_list.
    return (valid_tweet, match_index - 1, ResultStatus.ALL_OKAY)
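# Usage sketch for calculate_and_validate (illustrative values; `entities`
# would normally come from extract_and_parse below):
#
#   validity, match_index, status = calculate_and_validate(
#       entities={'tweet': 'count all emojis as equal'},
#       tweet_text_list=['we will now count all emojis as equal'])
#   if status == ResultStatus.ALL_OKAY and validity:
#       matched_text = tweet_text_list[match_index]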
def extract_and_parse(file_path: str):
    """Preprocesses text from an image.

    Extracts text from the image using the image service, then parses
    entities from the text using the text service.

    Args:
        file_path: A string representing the path of the image file.

    Returns:
        entities: Entities parsed from the text, such as tweet, user_id and date.
        status: Enum ResultStatus representing result status.
    """
    if not isinstance(file_path, str):
        raise TypeError('File path must be type string')
    if not file_path:
        raise ValueError('File path must be a valid path')
    try:
        text_extractor = image_service.Extractor()
        extracted_text, extractor_status = text_extractor.get_text(file_path)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    if extractor_status != ResultStatus.ALL_OKAY:
        return (None, extractor_status)
    logger.debug('Processed text: ' + extracted_text)
    try:
        entity_parser = text_service.DataParser()
        entities, parser_status = entity_parser.get_entities(extracted_text)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
    if parser_status != ResultStatus.ALL_OKAY:
        return (None, parser_status)
    logger.debug('Entities: ' + str(entities))
    return (entities, parser_status)
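# Usage sketch for extract_and_parse (illustrative path; the keys shown are
# the ones accessed elsewhere in this code):
#
#   entities, status = extract_and_parse('/tmp/screenshot.png')
#   if status == ResultStatus.ALL_OKAY:
#       print(entities['user_id'], entities['date'], entities['tweet'])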
def get_text(self, file_path: str):
    """Extracts text from an image.

    Rescales the image, runs OCR on the rescaled copy and removes the
    temporary file afterwards.
    """
    if not isinstance(file_path, str):
        raise TypeError('File path must be type string')
    if not file_path:
        raise ValueError('File path cannot be empty')
    logger.info('Processing Image...')
    try:
        new_file_path = self.rescale(file_path)
        logger.info('Extracting text from rescaled image...')
        img = PIL.Image.open(new_file_path)
        text = pytesseract.image_to_string(image=img)
        # Best-effort cleanup of the temporary rescaled image.
        try:
            os.remove(new_file_path)
        except Exception as e:
            logger.exception(e)
        if not text:
            return (None, ResultStatus.NO_RESULT)
        return (text, ResultStatus.ALL_OKAY)
    except Exception as e:
        logger.exception(e)
        return (None, ResultStatus.MODULE_FAILURE)
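# Usage sketch for get_text (illustrative path; `extractor` stands for an
# image_service.Extractor instance as used in extract_and_parse above):
#
#   extractor = image_service.Extractor()
#   text, status = extractor.get_text('/tmp/screenshot.png')
#   if status == ResultStatus.ALL_OKAY:
#       print(text)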