Example no. 1
def ingest_output_data(output_file, stemmer, tokenizer_str, stopword_str):
    """
    Ingest the tab-separated output data file, populating the module-level
    output_dict (condition -> full record) and output_token_dict
    (condition -> filtered tokens).
    """
    with open(output_file, 'r', encoding='utf-8') as fs:
        lines = fs.readlines()

    for line in lines:
        parts = line.split('\t')
        condition = parts[0].lower()
        # keep the full tab-separated record for this condition
        related_data = parts[0:]
        output_dict[condition] = related_data

        # tokenize the condition with the requested tokenizer
        if 'whitespace' == tokenizer_str:
            # strip punctuation before whitespace tokenization
            tokens = tokenizer.whitespace_tokenize(
                condition.replace('(', '').replace(')', '')
                         .replace(':', '').replace(',', ''),
                stemmer)
        elif 'nltk' == tokenizer_str:
            tokens = tokenizer.nltk_tokenize(condition)
        else:
            tokens = []

        # filter the tokens with the requested stopword strategy
        if 'aggressive' == stopword_str:
            output_token_dict[condition] = stopword.remove_agressive_stopwords(
                tokens)
        elif 'nltk' == stopword_str:
            output_token_dict[condition] = stopword.remove_nltk_stopwords(
                tokens)
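
A minimal driver sketch follows; the file name, stemmer choice, and the module-level output_dict / output_token_dict it populates are assumptions about how this project is wired together, not code from the original source.

# hypothetical usage, assumed to live in the same module as ingest_output_data
output_dict = {}
output_token_dict = {}

ingest_output_data('conditions_output.tsv', stemmer='porter',
                   tokenizer_str='nltk', stopword_str='nltk')
print(len(output_token_dict), 'conditions ingested')
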
Example no. 2
def test_nltk_tokenize():
    """
        Test nltk tokenization on a standard sentence
    """
    expected_result = ['The', 'man', ',', 'Ivan', 'Ivanovich', ',', 'did', 'not', 'know',
                       'the', 'cat', "'s", 'eye', 'color', '.']
    assert tokenizer.nltk_tokenize(PHRASE_1) == expected_result
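
The PHRASE_1 fixture is not shown in this example; judging from the expected token list it is presumably defined roughly as below, though the exact constant is an assumption.

# assumed test fixture, reconstructed from the expected tokens above
PHRASE_1 = "The man, Ivan Ivanovich, did not know the cat's eye color."
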
Example no. 3
def find_tokenized_variety(output_token_dict, conditions, threshold,
                           similarity_metric, stemmer, tokenizer_name):
    """
    Search for the most relevant matches to each condition based on token similarity.

    :param output_token_dict: The pre-tokenized dictionary of known conditions
    :param conditions: The list of query conditions to consider
    :param threshold: The minimum similarity to retain a match
    :param similarity_metric: Which similarity strategy to use
    :param stemmer: Which stemmer to use
    :param tokenizer_name: Which tokenizer to use
    :type output_token_dict: dict
    :type conditions: list
    :type threshold: float
    :type similarity_metric: str
    :type stemmer: str
    :type tokenizer_name: str
    :return: the matching (condition, similarity) pairs, sorted by descending similarity
    :rtype: list
    """
    total_dict = {}
    for condition in conditions:
        condition = condition.strip().lower()
        if 'whitespace' == tokenizer_name:
            tokens = tokenizer.whitespace_tokenize(condition, stemmer)
        elif 'nltk' == tokenizer_name:
            tokens = tokenizer.nltk_tokenize(condition)
        else:
            tokens = []
        for item in output_token_dict:
            item_tokens = output_token_dict[item]
            # here we compare tokens and item_tokens
            if 'cosine' == similarity_metric:
                similarity = metrics.cosine_similarity(tokens, item_tokens)
            elif 'jaccard' == similarity_metric:
                similarity = metrics.jaccard_similarity(tokens, item_tokens)
            else:
                similarity = metrics.harmonic_similarity(tokens, item_tokens)
            if similarity > threshold:
                # keep the best similarity per item so we can rank descending
                if item not in total_dict or similarity > total_dict[item]:
                    total_dict[item] = similarity
                print(condition + ' -> ' + item)
                print(similarity)

    sorted_by_similarity = sorted(total_dict.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
    return sorted_by_similarity
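
A sketch of how the search could be driven end to end; the dictionary contents, query string, and parameter values are placeholders, and the project's tokenizer and metrics modules are assumed to be importable.

# hypothetical query flow with placeholder data
known_tokens = {
    'chronic kidney disease': ['chronic', 'kidney', 'disease'],
    'acute kidney injury': ['acute', 'kidney', 'injury'],
}
matches = find_tokenized_variety(known_tokens,
                                 ['kidney disease'],
                                 threshold=0.5,
                                 similarity_metric='cosine',
                                 stemmer='porter',
                                 tokenizer_name='whitespace')
# matches is a list of (condition, similarity) tuples, best match first
print(matches)
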
Example no. 4
def test_nltk_tokenize_none():
    """
        Test nltk tokenization if None is received
    """
    expected_result = []
    assert tokenizer.nltk_tokenize(None) == expected_result
Example no. 5
def test_nltk_tokenize_empty_sentence():
    """
        Test nltk tokenization on an empty string
    """
    expected_result = []
    assert tokenizer.nltk_tokenize('') == expected_result