def predict_phishing_words(model_name, model_store_type, email_subject, email_body, min_text_length, label_threshold,
                           word_threshold, top_word_limit, is_return_error):
    """Predict a label for an email and explain which words drove the prediction.

    The subject and body are joined, tokenized with the ``WordTokenizerNLP``
    script (optionally hashing words with the ``hashSeed`` script argument),
    and passed to ``demisto_ml.explain_model_words``.

    :param model_name: name of the stored model, forwarded to ``get_model_data``.
    :param model_store_type: storage type of the model, forwarded to ``get_model_data``.
    :param email_subject: subject of the email to classify.
    :param email_body: body of the email to classify.
    :param min_text_length: minimal number of input words the model must know.
    :param label_threshold: minimal acceptable probability of the predicted label.
    :param word_threshold: forwarded to ``demisto_ml.explain_model_words``.
    :param top_word_limit: forwarded to ``demisto_ml.explain_model_words``.
    :param is_return_error: forwarded to ``handle_error`` on every failure.
        NOTE(review): ``handle_error`` is defined elsewhere; whether it stops
        execution when this flag is false cannot be told from here.
    :return: a war-room entry dict (``Type``, ``Contents``, ``HumanReadable``,
        ``EntryContext`` ...) carrying the explanation result.
    """
    model_data = get_model_data(model_name, model_store_type)
    model = demisto_ml.decode_model(model_data)
    text = "%s %s" % (email_subject, email_body)
    res = demisto.executeCommand('WordTokenizerNLP', {'value': text,
                                                      'hashWordWithSeed': demisto.args().get('hashSeed')})
    if is_error(res[0]):
        return_error(res[0]['Contents'])
    tokenized_text_result = res[0]['Contents']
    # The model is fed hashed tokens when hashing was requested, plain tokens otherwise.
    input_text = tokenized_text_result['hashedTokenizedText'] if tokenized_text_result.get('hashedTokenizedText') else \
        tokenized_text_result['tokenizedText']
    _, filtered_text_number_of_words = demisto_ml.filter_model_words(input_text, model)
    if filtered_text_number_of_words == 0:
        handle_error("The model does not contains any of the input text words", is_return_error)
    if filtered_text_number_of_words < min_text_length:
        handle_error("The model contains less then %d words" % min_text_length, is_return_error)

    explain_result = demisto_ml.explain_model_words(model,
                                                    input_text,
                                                    0,
                                                    word_threshold,
                                                    top_word_limit)
    probability = explain_result["Probability"]
    if probability < label_threshold:
        # The probability must be interpolated here; previously the raw
        # "%.2f" placeholder was shown to the user.
        handle_error("Label probability is %.2f and it's below the input threshold" % probability,
                     is_return_error)

    if tokenized_text_result.get('hashedTokenizedText'):
        # Translate hashed words back to their plain form for display.
        hash_word_to_plain = dict(
            zip(tokenized_text_result['hashedTokenizedText'].split(" "),
                tokenized_text_result['tokenizedText'].split(" ")))
        # Real lists (not lazy ``map`` objects) so the words can be joined more
        # than once and serialized as JSON on Python 3 as well as Python 2.
        explain_result['PositiveWords'] = [hash_word_to_plain[word] for word in explain_result['PositiveWords']]
        explain_result['NegativeWords'] = [hash_word_to_plain[word] for word in explain_result['NegativeWords']]
    explain_result['OriginalText'] = tokenized_text_result['originalText']
    # Default to the plain tokenized text; replaced below if highlighting succeeds.
    explain_result['TextTokensHighlighted'] = tokenized_text_result['tokenizedText']

    res = demisto.executeCommand('HighlightWords', {'text': tokenized_text_result['tokenizedText'],
                                                    'terms': ",".join(explain_result['PositiveWords'])})
    res = res[0]
    if not is_error(res):
        highlighted_text_markdown = res['Contents']
        explain_result['TextTokensHighlighted'] = highlighted_text_markdown
    # Separate copy for the human-readable table so the context keeps raw values.
    explain_result_hr = dict(explain_result)
    explain_result_hr['PositiveWords'] = ", ".join(explain_result_hr['PositiveWords'])
    explain_result_hr['NegativeWords'] = ", ".join(explain_result_hr['NegativeWords'])
    explain_result_hr['Probability'] = "%.2f" % explain_result_hr['Probability']
    return {
        'Type': entryTypes['note'],
        'Contents': explain_result,
        'ContentsFormat': formats['json'],
        'HumanReadable': tableToMarkdown('DBot Predict Phishing Words', explain_result_hr,
                                         headers=['TextTokensHighlighted', 'Label', 'Probability',
                                                  'PositiveWords', 'NegativeWords'],
                                         removeNull=True),
        'HumanReadableFormat': formats['markdown'],
        'EntryContext': {
            'DBotPredictPhishingWords': explain_result
        }
    }
# Example #2
# 0
def predict_phishing_words(model_name, model_store_type, email_subject, email_body):
    """Predict a label for an email and explain which words drove the prediction.

    The subject and body are joined, tokenized with the ``WordTokenizerNLP``
    script and passed to ``demisto_ml.explain_model_words``. Thresholds are
    read from the script arguments: ``labelProbabilityThreshold`` and
    ``wordThreshold`` (both default to 0), ``topWordsLimit`` (required) and
    the optional ``hashSeed``.

    :param model_name: name of the stored model, forwarded to ``get_model_data``.
    :param model_store_type: storage type of the model, forwarded to ``get_model_data``.
    :param email_subject: subject of the email to classify.
    :param email_body: body of the email to classify.
    :return: a war-room entry dict carrying the explanation result in
        ``Contents`` and under the ``DBotPredictPhishingWords`` context key.
    """
    script_args = demisto.args()
    model_data = get_model_data(model_name, model_store_type)
    model = demisto_ml.decode_model(model_data)
    text = "%s %s" % (email_subject, email_body)
    res = demisto.executeCommand('WordTokenizerNLP', {'value': text,
                                                      'hashWordWithSeed': script_args.get('hashSeed')})
    if is_error(res[0]):
        return_error(res[0]['Contents'])
    tokenized_text_result = res[0]['Contents']
    # The model is fed hashed tokens when hashing was requested, plain tokens otherwise.
    input_text = tokenized_text_result['hashedTokenizedText'] if tokenized_text_result.get('hashedTokenizedText') else \
        tokenized_text_result['tokenizedText']
    _, filtered_text_number_of_words = demisto_ml.filter_model_words(input_text, model)
    if filtered_text_number_of_words == 0:
        return_error("The model does not contains any of the input text words")

    explain_result = demisto_ml.explain_model_words(model,
                                                    input_text,
                                                    float(script_args.get('labelProbabilityThreshold', 0)),
                                                    float(script_args.get('wordThreshold', 0)),
                                                    int(script_args['topWordsLimit']))

    if tokenized_text_result.get('hashedTokenizedText'):
        # Translate hashed words back to their plain form for display.
        hash_word_to_plain = dict(
            zip(tokenized_text_result['hashedTokenizedText'].split(" "),
                tokenized_text_result['tokenizedText'].split(" ")))
        # Real lists (not lazy ``map`` objects): the words are joined below and
        # then emitted again in the table and the JSON contents, which would
        # see an exhausted iterator on Python 3.
        explain_result['PositiveWords'] = [hash_word_to_plain[word] for word in explain_result['PositiveWords']]
        explain_result['NegativeWords'] = [hash_word_to_plain[word] for word in explain_result['NegativeWords']]
    explain_result['OriginalText'] = tokenized_text_result['originalText']
    # Default to the plain tokenized text; replaced below if highlighting succeeds.
    explain_result['TextTokensHighlighted'] = tokenized_text_result['tokenizedText']

    res = demisto.executeCommand('HighlightWords', {'text': tokenized_text_result['tokenizedText'],
                                                    'terms': ",".join(explain_result['PositiveWords'])})
    res = res[0]
    if not is_error(res):
        highlighted_text_markdown = res['Contents']
        explain_result['TextTokensHighlighted'] = highlighted_text_markdown

    return {
        'Type': entryTypes['note'],
        'Contents': explain_result,
        'ContentsFormat': formats['json'],
        'HumanReadable': tableToMarkdown('DBot Predict Phihsing Words', explain_result,
                                         headers=['TextTokensHighlighted', 'Label', 'Probability',
                                                  'PositiveWords', 'NegativeWords'],
                                         removeNull=True),
        'HumanReadableFormat': formats['markdown'],
        'EntryContext': {
            'DBotPredictPhishingWords': explain_result
        }
    }
def predict_phishing_words(model_name,
                           model_store_type,
                           email_subject,
                           email_body,
                           min_text_length,
                           label_threshold,
                           word_threshold,
                           top_word_limit,
                           is_return_error,
                           set_incidents_fields=False):
    """Predict a label for an email and report the words behind the prediction.

    Steps performed:
      1. Load and decode the model, then tokenize "<subject> <body>" with the
         ``WordTokenizerNLP`` script (hashing words when the ``hashSeed``
         script argument is set).
      2. Report an error through ``handle_error`` when the model knows none of
         the words, fewer than ``min_text_length`` of them, or when the
         predicted probability is below ``label_threshold``.
      3. Map the positive/negative tokens back to words of the original text
         and highlight the positive ones with the ``HighlightWords`` script.
      4. Optionally write the prediction to the incident (``setIncident``)
         when ``set_incidents_fields`` is true and the incident is not the
         playground.

    :param model_name: name of the stored model, forwarded to ``get_model_data``.
    :param model_store_type: storage type of the model, forwarded to ``get_model_data``.
    :param email_subject: subject of the email to classify.
    :param email_body: body of the email to classify.
    :param min_text_length: minimal number of input words the model must know.
    :param label_threshold: minimal acceptable probability of the predicted label.
    :param word_threshold: forwarded to ``demisto_ml.explain_model_words``.
    :param top_word_limit: forwarded to ``demisto_ml.explain_model_words``.
    :param is_return_error: forwarded to ``get_model_data`` and ``handle_error``.
    :param set_incidents_fields: when true, also update the incident fields.
    :return: a war-room entry dict carrying the explanation result in
        ``Contents`` and under the ``DBotPredictPhishingWords`` context key.
    """

    def _alnum_only(word):
        # Keep only the alphanumeric characters of a token.
        return ''.join(ch for ch in word if ch.isalnum())

    decoded_model = demisto_ml.decode_model(
        get_model_data(model_name, model_store_type, is_return_error))

    tokenizer_args = {
        'value': "%s %s" % (email_subject, email_body),
        'hashWordWithSeed': demisto.args().get('hashSeed'),
    }
    tokenizer_entry = demisto.executeCommand('WordTokenizerNLP',
                                             tokenizer_args)[0]
    if is_error(tokenizer_entry):
        handle_error(tokenizer_entry['Contents'], is_return_error)
    tokens_info = tokenizer_entry['Contents']

    # Hashed tokens take precedence over plain ones whenever they are present.
    hashed_text = tokens_info.get('hashedTokenizedText')
    if hashed_text:
        model_input = tokens_info['hashedTokenizedText']
    else:
        model_input = tokens_info['tokenizedText']

    _, known_word_count = demisto_ml.filter_model_words(model_input,
                                                        decoded_model)
    if known_word_count == 0:
        handle_error("The model does not contain any of the input text words",
                     is_return_error)
    if known_word_count < min_text_length:
        handle_error(
            "The model contains fewer than %d words" % min_text_length,
            is_return_error)

    explain_result = demisto_ml.explain_model_words(decoded_model,
                                                    model_input, 0,
                                                    word_threshold,
                                                    top_word_limit)
    predicted_prob = explain_result["Probability"]
    if predicted_prob < label_threshold:
        message = "Label probability is {:.2f} and it's below the input confidence threshold".format(
            predicted_prob)
        handle_error(message, is_return_error)

    # Pick the word -> token mapping that matches the kind of tokens used.
    mapping_key = 'wordsToHashedTokens' if hashed_text else 'originalWordsToTokens'
    words_to_token_maps = tokens_info[mapping_key]

    positive_tokens = {_alnum_only(w) for w in explain_result['PositiveWords']}
    negative_tokens = {_alnum_only(w) for w in explain_result['NegativeWords']}
    positive_words = [
        w.strip(punctuation)
        for w in find_words_contain_tokens(positive_tokens, words_to_token_maps)
    ]
    negative_words = [
        w.strip(punctuation)
        for w in find_words_contain_tokens(negative_tokens, words_to_token_maps)
    ]

    # Highlight positive words in the original text; fall back to the stripped
    # original text when there is nothing to highlight or highlighting fails.
    original_text = tokens_info['originalText']
    needs_plain_text = True
    if positive_words:
        highlight_entry = demisto.executeCommand('HighlightWords', {
            'text': original_text,
            'terms': ",".join(positive_words)
        })[0]
        if not is_error(highlight_entry):
            highlighted_text_markdown = highlight_entry['Contents']
            needs_plain_text = False
    if needs_plain_text:
        highlighted_text_markdown = original_text.strip()

    explain_result['PositiveWords'] = positive_words
    explain_result['NegativeWords'] = negative_words
    explain_result['OriginalText'] = original_text.strip()
    explain_result['TextTokensHighlighted'] = highlighted_text_markdown
    predicted_label = explain_result["Label"]

    # Human-readable view: probability formatted, word lists joined.
    formatted_prob = "%.2f" % predicted_prob
    explain_result_hr = {
        'TextTokensHighlighted': highlighted_text_markdown,
        'Label': predicted_label,
        'Probability': formatted_prob,
        'Confidence': formatted_prob,
        'PositiveWords': ", ".join(positive_words),
        'NegativeWords': ", ".join(negative_words),
    }

    incident_context = demisto.incidents()[0]
    in_playground = incident_context['isPlayground']
    if set_incidents_fields and not in_playground:
        demisto.executeCommand(
            "setIncident", {
                'dbotprediction': predicted_label,
                'dbotpredictionprobability': predicted_prob,
                'dbottextsuggestionhighlighted': highlighted_text_markdown
            })

    table_headers = [
        'TextTokensHighlighted', 'Label', 'Confidence', 'PositiveWords',
        'NegativeWords'
    ]
    return {
        'Type': entryTypes['note'],
        'Contents': explain_result,
        'ContentsFormat': formats['json'],
        'HumanReadable': tableToMarkdown('DBot Predict Phishing Words',
                                         explain_result_hr,
                                         headers=table_headers,
                                         removeNull=True),
        'HumanReadableFormat': formats['markdown'],
        'EntryContext': {
            'DBotPredictPhishingWords': explain_result
        }
    }
# Example #4
# 0
def predict_phishing_words(model_name, model_store_type, email_subject,
                           email_body):
    """Predict a label for an email and explain which words drove the prediction.

    The subject and body are joined, tokenized with the ``WordTokenizerNLP``
    script and passed to ``demisto_ml.explain_model_words``. Thresholds are
    read from the script arguments: ``labelProbabilityThreshold`` and
    ``wordThreshold`` (both default to 0), ``topWordsLimit`` (required) and
    the optional ``hashSeed``.

    :param model_name: name of the stored model, forwarded to ``get_model_data``.
    :param model_store_type: storage type of the model, forwarded to ``get_model_data``.
    :param email_subject: subject of the email to classify.
    :param email_body: body of the email to classify.
    :return: a war-room entry dict carrying the explanation result in
        ``Contents`` and under the ``DBotPredictPhishingWords`` context key.
    """
    script_args = demisto.args()
    model_data = get_model_data(model_name, model_store_type)
    model = demisto_ml.decode_model(model_data)
    text = "%s %s" % (email_subject, email_body)
    res = demisto.executeCommand(
        'WordTokenizerNLP', {
            'value': text,
            'hashWordWithSeed': script_args.get('hashSeed')
        })
    if is_error(res[0]):
        return_error(res[0]['Contents'])
    tokenized_text_result = res[0]['Contents']
    # The model is fed hashed tokens when hashing was requested, plain tokens otherwise.
    input_text = tokenized_text_result['hashedTokenizedText'] if tokenized_text_result.get('hashedTokenizedText') else \
        tokenized_text_result['tokenizedText']
    _, filtered_text_number_of_words = demisto_ml.filter_model_words(
        input_text, model)
    if filtered_text_number_of_words == 0:
        return_error("The model does not contains any of the input text words")

    explain_result = demisto_ml.explain_model_words(
        model, input_text,
        float(script_args.get('labelProbabilityThreshold', 0)),
        float(script_args.get('wordThreshold', 0)),
        int(script_args['topWordsLimit']))

    if tokenized_text_result.get('hashedTokenizedText'):
        # Translate hashed words back to their plain form for display.
        hash_word_to_plain = dict(
            zip(tokenized_text_result['hashedTokenizedText'].split(" "),
                tokenized_text_result['tokenizedText'].split(" ")))
        # Real lists (not lazy ``map`` objects): the words are joined below and
        # then emitted again in the table and the JSON contents, which would
        # see an exhausted iterator on Python 3.
        explain_result['PositiveWords'] = [
            hash_word_to_plain[word]
            for word in explain_result['PositiveWords']
        ]
        explain_result['NegativeWords'] = [
            hash_word_to_plain[word]
            for word in explain_result['NegativeWords']
        ]
    explain_result['OriginalText'] = tokenized_text_result['originalText']
    # Default to the plain tokenized text; replaced below if highlighting succeeds.
    explain_result['TextTokensHighlighted'] = tokenized_text_result[
        'tokenizedText']

    res = demisto.executeCommand(
        'HighlightWords', {
            'text': tokenized_text_result['tokenizedText'],
            'terms': ",".join(explain_result['PositiveWords'])
        })
    res = res[0]
    if not is_error(res):
        highlighted_text_markdown = res['Contents']
        explain_result['TextTokensHighlighted'] = highlighted_text_markdown

    return {
        'Type':
        entryTypes['note'],
        'Contents':
        explain_result,
        'ContentsFormat':
        formats['json'],
        'HumanReadable':
        tableToMarkdown('DBot Predict Phihsing Words',
                        explain_result,
                        headers=[
                            'TextTokensHighlighted', 'Label', 'Probability',
                            'PositiveWords', 'NegativeWords'
                        ],
                        removeNull=True),
        'HumanReadableFormat':
        formats['markdown'],
        'EntryContext': {
            'DBotPredictPhishingWords': explain_result
        }
    }