Example #1
    def get_sentiment(text: str) -> Optional[NamedTuple]:
        """
        Returns a named tuple Sentiment containing polarity and subjectivity of the input text
        :param text: string containing text to analyze
        :return: a named tuple Sentiment if the input text is non-empty, None otherwise
        """
        if is_string_empty(text):
            return None

        return TextBlob(text).sentiment
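
For reference, TextBlob's sentiment property returns a Sentiment named tuple whose polarity lies in [-1, 1] and subjectivity in [0, 1]. A minimal standalone check (assuming textblob is installed and its corpora downloaded):

    from textblob import TextBlob

    blob = TextBlob("TextBlob makes sentiment analysis pleasantly simple")
    print(blob.sentiment)               # Sentiment(polarity=..., subjectivity=...)
    print(blob.sentiment.polarity)      # float in [-1, 1]
    print(blob.sentiment.subjectivity)  # float in [0, 1]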
Example #2
    def get_subjectivity(text: str) -> Optional[float]:
        """
        Returns the subjectivity of the input text
        :param text: string containing text to analyze
        :return: a float in [0, 1] indicating the subjectivity of the supplied text if non-empty, None otherwise
        """
        if is_string_empty(text):
            return None

        return TextBlob(text).subjectivity
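
The is_string_empty helper used throughout these examples comes from the surrounding project and is not shown here. A minimal sketch of what it plausibly does, given how it is called (a hypothetical reconstruction, not the project's actual code):

    from typing import Optional

    def is_string_empty(text: Optional[str]) -> bool:
        # Assumption: None and whitespace-only strings both count as empty
        return text is None or not text.strip()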
Example #3
    def process(self, text: str) -> Optional[str]:
        """
        Process a single string by applying cleaning methods defined in text.py
        :param text: string to process
        :return: the processed string; None if the input string is empty or None, or if the processed text would be empty
        """
        if is_string_empty(text):
            return None

        text = strip_html(text)
        text = remove_stopwords(text)
        text = transform_accented_chars(text)
        if self.to_lowercase:
            text = lowercase(text)
        text = remove_nonalphanumeric(text)
        text = re.sub(" +", " ", text)
        text = text.strip()

        if is_string_empty(text):
            return None

        return text
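
The cleaning helpers (strip_html, remove_stopwords, transform_accented_chars, lowercase, remove_nonalphanumeric) live in text.py and are not shown here. Minimal sketches of two of them, assuming unicodedata-based folding and a simple regex (illustrative assumptions only, not the project's actual code):

    import re
    import unicodedata

    def transform_accented_chars(text: str) -> str:
        # Fold accented characters to ASCII equivalents, e.g. "café" -> "cafe"
        return (unicodedata.normalize("NFKD", text)
                .encode("ascii", "ignore")
                .decode("ascii"))

    def remove_nonalphanumeric(text: str) -> str:
        # Replace anything that is not a letter, digit or whitespace with a space
        return re.sub(r"[^0-9A-Za-z\s]", " ", text)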
Example #4
    def process_multiprocessing(
        self,
        texts: Iterable[str],
        file_name: str,
        total_count: int,
        batch_size: int = 100000,
    ):
        """
        Parallelize the process method using n - 1 CPU cores and save results in batches to a file
        :param texts: an iterable with string objects
        :param file_name: path to the file to append preprocessed texts to
        :param total_count: total number of records to iterate over
        :param batch_size: the size of a single text list to use for parallel preprocessing
        :return:
        """

        num_processed = 0

        with tqdm(total=total_count) as pbar:
            with open(file_name, "a") as file:
                with Pool(NUM_CORES) as pool:
                    while True:
                        # Create a batch of texts as a list for use in pool.imap
                        current_batch = []
                        for _ in range(batch_size):
                            try:
                                current_batch.append(next(texts))
                            except StopIteration:
                                # Generator exhausted; process whatever remains in the batch
                                break

                        # Break the outer loop if no more texts to process
                        if len(current_batch) == 0:
                            break

                        # Process texts in parallel
                        text_processed_batch = list(
                            tqdm(
                                pool.imap(self.process, current_batch),
                                total=len(current_batch),
                            )
                        )

                        # Save processed texts to a file, with each text in a new line
                        for text in text_processed_batch:
                            if not is_string_empty(text):
                                file.write(text + "\n")

                            num_processed += 1
                            pbar.update(1)  # update() takes an increment, not the running total
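
A usage sketch for the method above, assuming a TextPreprocessor instance and a generator that streams raw texts from a file (file names and the record count are hypothetical):

    # Hypothetical driver: stream texts line by line and preprocess in parallel
    preprocessor = TextPreprocessor()
    with open("texts_raw.txt") as f:
        texts = (line.rstrip("\n") for line in f)
        preprocessor.process_multiprocessing(
            texts,
            file_name="texts_processed.txt",
            total_count=1_000_000,  # known or estimated number of records
        )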
def run(
    data_generator,
    process_text: bool,
    batch_size: int = 100000,
):
    """
    Compute and insert sentiment scores (polarity, subjectivity) into DB for all existing items
    :param data_generator: an iterable where each element is a single row from the DB
    :param process_text: whether to preprocess the text before computing sentiment and inserting into the DB
    :param batch_size: size of the batch to use for the named cursor when querying DB for data
    :return:
    """
    logging.info("starting task %s", __name__)
    conn = DBConnection(
        user="******", password=DB_PASSWORD, db_name=DB_NAME_HACKERNEWS
    )
    text_inserter = TextInserter(conn, TABLE_NAME_TEXTS, PRIMARY_KEY_NAME_TEXTS)
    sentiment_classifier = SentimentClassifier()
    text_preprocessor = TextPreprocessor()
    is_generator_exhausted = False

    if process_text:
        while not is_generator_exhausted:
            current_batch = []
            for _ in range(batch_size):
                try:
                    current_batch.append(next(data_generator))
                except StopIteration:
                    logging.info("generator %s exhausted, finishing", data_generator)
                    is_generator_exhausted = True
                    break

            if len(current_batch) == 0:
                break

            for item_id, title, text in tqdm(current_batch):
                # Preprocess "text" field if not empty, otherwise preprocess title (stories don't have text)
                if is_string_empty(text):
                    raw_text = title
                else:
                    raw_text = text

                text_preprocessed = text_preprocessor.process(raw_text)

                text_obj = Text(item_id, text_preprocessed)

                # Insert preprocessed text
                text_inserter.insert_text(text_obj)

                # Use unprocessed text for sentiment computation
                sentiment = sentiment_classifier.get_sentiment(raw_text)
                text_inserter.insert_sentiment(sentiment, item_id)
    else:
        while True:
            current_batch = []
            for _ in range(batch_size):
                try:
                    current_batch.append(next(data_generator))
                except StopIteration:
                    logging.info("generator %s exhausted, finishing", data_generator)
                    break

            if len(current_batch) == 0:
                break

            for item_id, title, text in current_batch:
                if is_string_empty(text):
                    raw_text = title
                else:
                    raw_text = text
                sentiment = sentiment_classifier.get_sentiment(raw_text)
                text_inserter.insert_sentiment(sentiment, item_id)

    logging.info("finished task: %s", __name__)