Example #1
def translate_text(text_list):
    translate_analyzer = TranslationAnalyzer(
        model_name_or_path="Helsinki-NLP/opus-mt-hi-en",
        device="auto"
    )
    source_responses = [AnalyzerRequest(processed_text=text.processed_text, source_name="sample") for text in text_list]
    analyzer_responses = translate_analyzer.analyze_input(source_response_list=source_responses)
    return [
        AnalyzerRequest(processed_text=response.segmented_data['translated_text'], source_name="translator")
        for response in analyzer_responses
    ]
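A minimal usage sketch for the helper above, assuming translate_text and the obsei classes shown in the surrounding examples (AnalyzerRequest, TranslationAnalyzer) are already in scope; the Hindi sample strings are illustrative only.

# Illustrative inputs: wrap raw strings in AnalyzerRequest, as the other examples do
hindi_requests = [
    AnalyzerRequest(processed_text=text, source_name="sample")
    for text in ["आप कैसे हैं?", "मौसम अच्छा है।"]
]

english_requests = translate_text(hindi_requests)
for request in english_requests:
    print(request.processed_text)  # translated text produced by the analyzer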
Example #2
def test_pii_analyzer_replace_original(pii_analyzer):
    analyzer_config = PresidioPIIAnalyzerConfig(analyze_only=False,
                                                return_decision_process=True,
                                                replace_original_text=True)

    source_responses = [
        AnalyzerRequest(processed_text=text, source_name="sample")
        for text in TEXTS
    ]
    analyzer_responses = pii_analyzer.analyze_input(
        source_response_list=source_responses, analyzer_config=analyzer_config)
    assert len(analyzer_responses) == len(TEXTS)

    for text, analyzer_response in zip(TEXTS, analyzer_responses):

        assert analyzer_response.segmented_data is not None
        assert analyzer_response.segmented_data["analyzer_result"] is not None
        assert analyzer_response.segmented_data[
            "anonymized_result"] is not None
        assert analyzer_response.segmented_data["anonymized_text"] is not None

        for pii_info in PII_LIST:
            assert pii_info not in analyzer_response.segmented_data[
                "anonymized_text"]

        assert (analyzer_response.segmented_data["anonymized_text"] ==
                analyzer_response.processed_text)
        assert analyzer_response.segmented_data["anonymized_text"] != text
Example #3
    def lookup(self, config: PlayStoreConfig,
               **kwargs) -> List[AnalyzerRequest]:
        source_responses: List[AnalyzerRequest] = []
        # Refer https://github.com/googleapis/google-api-python-client/blob/master/docs/start.md
        with build(serviceName='androidpublisher',
                   version='v3',
                   credentials=config.get_google_credentials(),
                   developerKey=config.cred_info.developer_key.
                   get_secret_value()) as service:
            reviews = service.reviews()
            pagination_token: Optional[str] = None

            # Get data from state
            id: str = kwargs.get("id", None)
            state: Dict[
                str,
                Any] = None if id is None else self.store.get_source_state(id)
            start_index: Optional[
                str] = config.start_index or None if state is None else state.get(
                    "start_index", None)
            update_state: bool = True if id else False
            state = state or dict()
            review_id = start_index

            while True:
                # Refer https://googleapis.github.io/google-api-python-client/docs/dyn/androidpublisher_v3.reviews.html#list
                responses = reviews.list(packageName=config.package_name,
                                         maxResults=config.max_results,
                                         startIndex=start_index,
                                         token=pagination_token).execute()

                if "reviews" in responses:
                    reviews = responses["responses"]
                    for review in reviews:
                        if "comments" not in review:
                            continue

                        review_id = review["reviewId"]

                        # Currently only one user comment is supported
                        text = review["comments"][0]["userComment"]["text"]
                        source_responses.append(
                            AnalyzerRequest(processed_text=text,
                                            meta=review,
                                            source_name=self.NAME))

                pagination_token = None
                if "tokenPagination" in responses:
                    if "nextPageToken" in responses["tokenPagination"]:
                        pagination_token = responses["tokenPagination"][
                            "nextPageToken"]

                if pagination_token is None:
                    break

        if update_state:
            state["start_index"] = review_id
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses
Example #4
    def _get_source_output(self, tweet: Dict[str, Any]):
        tweet_url = TwitterSource.get_tweet_url(tweet["text"])
        processed_text = TwitterSource.clean_tweet_text(tweet["text"])

        tweet["tweet_url"] = tweet_url
        return AnalyzerRequest(processed_text=processed_text,
                               meta=tweet,
                               source_name=self.NAME)
Example #5
    def lookup(self, config: RedditScrapperConfig,
               **kwargs) -> List[AnalyzerRequest]:
        source_responses: List[AnalyzerRequest] = []

        # Get data from state
        id: str = kwargs.get("id", None)
        state: Dict[
            str, Any] = None if id is None else self.store.get_source_state(id)
        update_state: bool = True if id else False
        state = state or dict()

        scrapper_stat: Dict[str, Any] = state.get(config.url_id, dict())
        lookup_period: str = scrapper_stat.get("since_time",
                                               config.lookup_period)
        lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
        if len(lookup_period) <= 5:
            since_time = convert_utc_time(lookup_period)
        else:
            since_time = datetime.strptime(lookup_period,
                                           DATETIME_STRING_PATTERN)

        last_since_time: datetime = since_time

        since_id: Optional[str] = scrapper_stat.get("since_id", None)
        last_index = since_id
        state[config.url_id] = scrapper_stat

        reddit_data: Optional[List[RedditContent]] = None
        try:
            reddit_data = config.get_readers().fetch_content(after=since_time,
                                                             since_id=since_id)
        except RuntimeError as ex:
            logger.warning(ex.__cause__)

        reddit_data = reddit_data or []

        for reddit in reddit_data:
            source_responses.append(
                AnalyzerRequest(
                    processed_text=f"{reddit.title}. {reddit.extracted_text}",
                    meta=reddit.__dict__,
                    source_name=self.NAME))

            if last_since_time is None or last_since_time < reddit.updated:
                last_since_time = reddit.updated
            if last_index is None:
                # Assuming list is sorted based on time
                last_index = reddit.id

        scrapper_stat["since_time"] = last_since_time.strftime(
            DATETIME_STRING_PATTERN)
        scrapper_stat["since_id"] = last_index

        if update_state:
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses
Example #6
    def lookup(self, config: AppStoreScrapperConfig,
               **kwargs) -> List[AnalyzerRequest]:
        source_responses: List[AnalyzerRequest] = []

        # Get data from state
        id: str = kwargs.get("id", None)
        state: Dict[
            str, Any] = None if id is None else self.store.get_source_state(id)
        update_state: bool = True if id else False
        state = state or dict()

        for scrapper in config.get_review_readers():
            country_stat: Dict[str, Any] = state.get(scrapper.country, dict())
            lookup_period: str = country_stat.get("since_time",
                                                  config.lookup_period)
            lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
            if len(lookup_period) <= 5:
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period,
                                               DATETIME_STRING_PATTERN)

            last_since_time: datetime = since_time

            since_id: Optional[int] = country_stat.get("since_id", None)
            last_index = since_id
            state[scrapper.country] = country_stat

            reviews = scrapper.fetch_reviews(after=since_time,
                                             since_id=since_id)
            reviews = reviews or []

            for review in reviews:
                source_responses.append(
                    AnalyzerRequest(
                        processed_text=f"{review.title}. {review.content}",
                        meta=review.__dict__,
                        source_name=self.NAME,
                    ))

                if review.date < since_time:
                    break
                if last_since_time is None or last_since_time < review.date:
                    last_since_time = review.date
                if last_index is None or last_index < review.id:
                    last_index = review.id

            country_stat["since_time"] = last_since_time.strftime(
                DATETIME_STRING_PATTERN)
            country_stat["since_id"] = last_index

        if update_state:
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses
Example #7
def test_translate_analyzer(translate_analyzer):
    source_responses = [
        AnalyzerRequest(processed_text=text, source_name="sample")
        for text in TEXTS
    ]
    analyzer_responses = translate_analyzer.analyze_input(
        source_response_list=source_responses)
    assert len(analyzer_responses) == len(TEXTS)

    for text, analyzer_response in zip(TEXTS, analyzer_responses):
        assert analyzer_response.segmented_data is not None
        assert text != analyzer_response.segmented_data["translated_text"]
Example #8
def test_vader_analyzer(vader_analyzer):
    source_responses = [AnalyzerRequest(processed_text=text, source_name="sample") for text in TEXTS]
    analyzer_responses = vader_analyzer.analyze_input(
        source_response_list=source_responses
    )

    assert len(analyzer_responses) == len(TEXTS)

    for analyzer_response in analyzer_responses:
        assert len(analyzer_response.segmented_data) == 2
        assert "positive" in analyzer_response.segmented_data
        assert "negative" in analyzer_response.segmented_data
Example #9
def test_translate_analyzer(translate_analyzer):
    source_responses = [
        AnalyzerRequest(processed_text=text, source_name="sample")
        for text in TEXTS
    ]
    analyzer_responses = translate_analyzer.analyze_input(
        source_response_list=source_responses)
    assert len(analyzer_responses) == len(TEXTS)

    logger.info("Result:")
    for analyzer_response in analyzer_responses:
        # print(analyzer_response)
        logger.info(analyzer_response)
Example #10
def test_zero_shot_analyzer(zero_shot_analyzer):
    labels = ["facility", "food", "comfortable", "positive", "negative"]

    source_responses = [AnalyzerRequest(processed_text=text, source_name="sample") for text in TEXTS]
    analyzer_responses = zero_shot_analyzer.analyze_input(
        source_response_list=source_responses,
        analyzer_config=ClassificationAnalyzerConfig(
            labels=labels
        )
    )

    assert len(analyzer_responses) == len(TEXTS)

    for analyzer_response in analyzer_responses:
        assert len(analyzer_response.segmented_data) == len(labels)
        assert "positive" in analyzer_response.segmented_data
        assert "negative" in analyzer_response.segmented_data
Example #11
def classify_texts(request: ClassifierRequest):
    global rate_limiter
    global analyzer
    with rate_limiter.run():
        analyzer_requests: List[AnalyzerRequest] = [
            AnalyzerRequest(processed_text=text, source_name="API")
            for text in request.texts
        ]
        analyzer_responses = analyzer.analyze_input(
            source_response_list=analyzer_requests,
            analyzer_config=request.analyzer_config,
        )

        response = []
        for analyzer_response in analyzer_responses:
            response.append(analyzer_response.segmented_data)

        return ClassifierResponse(data=response)
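The ClassifierRequest and ClassifierResponse types used above are not part of this snippet; a minimal sketch of how they might be declared with pydantic, assuming texts and analyzer_config are the only request fields the handler relies on:

from typing import Any, Dict, List, Optional

from pydantic import BaseModel


class ClassifierRequest(BaseModel):
    # Raw texts to classify plus an optional analyzer-specific configuration
    texts: List[str]
    analyzer_config: Optional[Any] = None


class ClassifierResponse(BaseModel):
    # One segmented_data dict per input text
    data: List[Dict[str, Any]]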
Example #12
def test_pii_analyzer_analyze_only(pii_analyzer):
    analyzer_config = PresidioPIIAnalyzerConfig(analyze_only=True,
                                                return_decision_process=True)

    source_responses = [
        AnalyzerRequest(processed_text=text, source_name="sample")
        for text in TEXTS
    ]
    analyzer_responses = pii_analyzer.analyze_input(
        source_response_list=source_responses, analyzer_config=analyzer_config)
    assert len(analyzer_responses) == len(TEXTS)

    for text, analyzer_response in zip(TEXTS, analyzer_responses):

        assert analyzer_response.segmented_data is not None
        assert analyzer_response.segmented_data["analyzer_result"] is not None
        assert analyzer_response.segmented_data["anonymized_result"] is None

        assert text == analyzer_response.processed_text
Example #13
def test_ner_analyzer(ner_analyzer):
    source_responses = [
        AnalyzerRequest(
            processed_text="My name is Lalit and I live in Berlin, Germany.",
            source_name="sample",
        )
    ]
    analyzer_responses = ner_analyzer.analyze_input(
        source_response_list=source_responses)

    assert len(analyzer_responses) == 1

    entities = analyzer_responses[0].segmented_data["data"]
    matched_count = 0
    for entity in entities:
        if entity["word"] == "Lalit" and entity["entity_group"] == "PER":
            matched_count = matched_count + 1
        elif entity["word"] == "Berlin" and entity["entity_group"] == "LOC":
            matched_count = matched_count + 1
        elif entity["word"] == "Germany" and entity["entity_group"] == "LOC":
            matched_count = matched_count + 1

    assert matched_count == 3
Example #14
    def lookup(self, config: EmailConfig, **kwargs) -> List[AnalyzerRequest]:
        source_responses: List[AnalyzerRequest] = []

        # Get data from state
        id: str = kwargs.get("id", None)
        state: Dict[
            str, Any] = None if id is None else self.store.get_source_state(id)
        update_state: bool = True if id else False
        state = state or dict()

        imap_client = config.get_client()

        for mailbox in config.mailboxes:
            need_more_lookup = True

            status, messages = imap_client.select(mailbox=mailbox,
                                                  readonly=True)
            if status != 'OK':
                logger.warning(f"Not able to connect with {mailbox}: {status}")
                continue

            mailbox_stat: Dict[str, Any] = state.get(mailbox, dict())
            lookup_period: str = mailbox_stat.get(
                "since_time", config.lookup_period or DEFAULT_LOOKUP_PERIOD)
            if len(lookup_period) <= 5:
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period,
                                               DATETIME_STRING_PATTERN)

            if since_time.tzinfo is None:
                since_time = since_time.replace(tzinfo=pytz.utc)
            else:
                since_time = since_time.astimezone(pytz.utc)

            last_since_time: datetime = since_time
            since_id: Optional[int] = mailbox_stat.get("since_message_id",
                                                       None)
            last_index = since_id

            state[mailbox] = mailbox_stat

            num_of_emails = int(messages[0])

            # Read in reverse order, i.e. latest emails first
            # Most of this code is borrowed from https://www.thepythoncode.com/article/reading-emails-in-python
            # and modified to suit this source
            for index in range(num_of_emails, 0, -1):
                email_meta: Dict[str, Any] = dict()

                # fetch the email message by ID
                status, email_message = imap_client.fetch(
                    str(index), "(RFC822)")

                email_content: str = ""

                for response in email_message:
                    if isinstance(response, tuple):
                        # parse a bytes email into a message object
                        msg = email.message_from_bytes(response[1])

                        email_meta["subject"] = self._parse_email_header(
                            msg, "Subject")
                        email_meta["from_address"] = self._parse_email_header(
                            msg, "From")
                        email_meta["to_address"] = self._parse_email_header(
                            msg, "To")
                        date_received_str = self._parse_email_header(
                            msg, "Date")

                        try:
                            date_received = datetime.strptime(
                                date_received_str, "%a, %d %b %Y %H:%M:%S %Z")
                        except Exception:
                            try:
                                date_received = datetime.strptime(
                                    date_received_str,
                                    "%a, %d %b %Y %H:%M:%S %z")
                            except Exception:
                                date_received = datetime.strptime(
                                    date_received_str,
                                    "%a, %d %b %Y %H:%M:%S %z (%Z)")

                        if date_received.tzinfo is None:
                            date_received = date_received.replace(
                                tzinfo=pytz.utc)
                        else:
                            date_received = date_received.astimezone(pytz.utc)
                        email_meta["date_received"] = date_received
                        email_meta["message_id"] = self._parse_email_header(
                            msg, "Message-ID")

                        part_id = 0
                        # if the email message is multipart
                        if msg.is_multipart():
                            # iterate over email parts
                            for part in msg.walk():
                                part_id_str = f'part_{part_id}'
                                # extract content type of email
                                content_type = part.get_content_type()
                                content_disposition = str(
                                    part.get("Content-Disposition"))

                                email_meta[part_id_str] = dict()
                                email_meta[part_id_str][
                                    "content_type"] = content_type
                                email_meta[part_id_str][
                                    "content_disposition"] = content_disposition

                                if "attachment" not in content_disposition and "text/" in content_type:
                                    try:
                                        # get the email body
                                        email_body = part.get_payload(
                                            decode=True).decode()
                                        if content_type == "text/html":
                                            email_body = text_from_html(
                                                email_body)
                                        # append email body with existing
                                        email_meta[part_id_str][
                                            "email_body"] = email_body
                                        email_content = email_content + "\n" + email_body
                                    except Exception:
                                        logger.error(
                                            "Unable to parse email body")
                                elif "attachment" in content_disposition:
                                    logger.warning(
                                        "Email attachment download is not supported"
                                    )
                                    # Download attachment is commented currently
                                    # # download attachment
                                    # filename = part.get_filename()
                                    # if filename:
                                    #    folder_name = self.clean(subject)
                                    #    if not os.path.isdir(folder_name):
                                    #        # make a folder for this email (named after the subject)
                                    #        os.mkdir(folder_name)
                                    #    filepath = os.path.join(folder_name, filename)
                                    #    # download attachment and save it
                                    #    open(filepath, "wb").write(part.get_payload(decode=True))

                                part_id = part_id + 1
                        else:
                            part_id_str = f'part_{part_id}'
                            email_meta[part_id_str] = dict()
                            # extract content type of email
                            content_type = msg.get_content_type()
                            email_meta[part_id_str][
                                "content_type"] = content_type

                            # get the email body
                            email_body = msg.get_payload(decode=True).decode()
                            if content_type == "text/html":
                                email_body = text_from_html(email_body)

                            email_meta[part_id_str]["email_body"] = email_body
                            email_content = email_content + "\n" + email_body

                        if date_received < since_time:
                            need_more_lookup = False
                            break
                        if last_index and last_index == email_meta[
                                "message_id"]:
                            need_more_lookup = False
                            break
                        if last_since_time is None or last_since_time < date_received:
                            last_since_time = date_received
                        if last_index is None:
                            last_index = email_meta["message_id"]

                        source_responses.append(
                            AnalyzerRequest(processed_text="\n".join(
                                [email_meta.get("subject"), email_content]),
                                            meta=email_meta,
                                            source_name=self.NAME))

                if not need_more_lookup:
                    break

            mailbox_stat["since_time"] = last_since_time.strftime(
                DATETIME_STRING_PATTERN)
            mailbox_stat["since_comment_id"] = last_index

        if update_state:
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses
Example #15
import logging
import sys

from obsei.analyzer.base_analyzer import AnalyzerRequest
from obsei.analyzer.pii_analyzer import (
    PresidioEngineConfig,
    PresidioModelConfig,
    PresidioPIIAnalyzer,
    PresidioPIIAnalyzerConfig,
)

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

analyzer_config = PresidioPIIAnalyzerConfig(
    analyze_only=False, return_decision_process=True
)
analyzer = PresidioPIIAnalyzer(
    engine_config=PresidioEngineConfig(
        nlp_engine_name="spacy",
        models=[PresidioModelConfig(model_name="en_core_web_lg", lang_code="en")],
    )
)

text_to_anonymize = "His name is Mr. Jones and his phone number is 212-555-5555"

analyzer_results = analyzer.analyze_input(
    source_response_list=[AnalyzerRequest(processed_text=text_to_anonymize)],
    analyzer_config=analyzer_config,
)

for analyzer_result in analyzer_results:
    logging.info(analyzer_result.to_dict())
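Since analyze_only=False here, each result also carries the anonymized text under segmented_data, the same key the PII tests above assert on; an illustrative extension of the script:

for analyzer_result in analyzer_results:
    # "anonymized_text" is populated because analyze_only=False in analyzer_config
    logger.info(analyzer_result.segmented_data["anonymized_text"])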
Example #16
    def lookup(self, config: RedditConfig, **kwargs) -> List[AnalyzerRequest]:
        source_responses: List[AnalyzerRequest] = []

        # Get data from state
        id: str = kwargs.get("id", None)
        state: Dict[
            str, Any] = None if id is None else self.store.get_source_state(id)
        update_state: bool = True if id else False
        state = state or dict()

        subreddit_reference = config.get_reddit_client().subreddit("+".join(
            config.subreddits))
        post_stream = subreddit_reference.stream.submissions(pause_after=-1)
        for post in post_stream:
            if post is None:
                break

            post_data = vars(post)
            post_id = post_data["id"]
            if config.post_ids and post_id not in config.post_ids:
                continue

            post_stat: Dict[str, Any] = state.get(post_id, dict())
            lookup_period: str = post_stat.get("since_time",
                                               config.lookup_period)
            lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
            if len(lookup_period) <= 5:
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period,
                                               DATETIME_STRING_PATTERN)

            last_since_time: datetime = since_time

            since_id: Optional[str] = post_stat.get("since_comment_id", None)
            last_index = since_id
            state[post_id] = post_stat

            post.comment_sort = "new"
            post.comments.replace_more(limit=None)

            # top_level_comments only
            first_comment = True
            for comment in post.comments:
                comment_data = vars(comment)
                if config.include_post_meta:
                    comment_data[config.post_meta_field] = post_data

                comment_time = datetime.utcfromtimestamp(
                    int(comment_data["created_utc"]))
                comment_id = comment_data["id"]

                if comment_time < since_time:
                    break
                if last_index and last_index == comment_id:
                    break
                if last_since_time is None or last_since_time < comment_time:
                    last_since_time = comment_time
                if last_index is None or first_comment:
                    last_index = comment_id
                    first_comment = False

                text = "".join(text_from_html(comment_data["body_html"]))

                source_responses.append(
                    AnalyzerRequest(processed_text=text,
                                    meta=comment_data,
                                    source_name=self.NAME))

            post_stat["since_time"] = last_since_time.strftime(
                DATETIME_STRING_PATTERN)
            post_stat["since_comment_id"] = last_index

        if update_state:
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses
Example #17
    def lookup(
        self, config: PlayStoreScrapperConfig, **kwargs
    ) -> List[AnalyzerRequest]:
        source_responses: List[AnalyzerRequest] = []

        # Get data from state
        id: str = kwargs.get("id", None)
        state: Dict[str, Any] = None if id is None else self.store.get_source_state(id)
        update_state: bool = True if id else False
        state = state or dict()

        for country in config.countries:
            country_stat: Dict[str, Any] = state.get(country, dict())
            lookup_period: str = country_stat.get("since_time", config.lookup_period)
            lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
            if len(lookup_period) <= 5:
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)

            last_since_time: datetime = since_time

            # since_id: Optional[str] = country_stat.get("since_id", None)
            # last_index = since_id
            # state[scrapper.country] = country_stat

            continuation_token: Optional[ContinuationToken] = None
            while True:
                store_reviews, continuation_token = reviews(
                    app_id=config.package_name,
                    lang=config.language,
                    country=country,
                    sort=Sort.NEWEST,
                    filter_score_with=config.filter_score_with,
                    continuation_token=continuation_token,
                    count=config.max_count,
                )
                store_reviews = store_reviews or []

                for review in store_reviews:
                    source_responses.append(
                        AnalyzerRequest(
                            processed_text=review["content"],
                            meta=review,
                            source_name=self.NAME,
                        )
                    )

                    if since_time > review["at"]:
                        break

                    if last_since_time is None or last_since_time < review["at"]:
                        last_since_time = review["at"]
                    # if last_index is None or last_index < review.id:
                    #    last_index = review.id

                if (
                    continuation_token is None
                    or continuation_token.token is None
                    or continuation_token.count <= len(source_responses)
                ):
                    break

            country_stat["since_time"] = last_since_time.strftime(
                DATETIME_STRING_PATTERN
            )
            # country_stat["since_id"] = last_index

        if update_state:
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses