Example #1
from collections import Counter


def find_sentence_anagrams1(sentence: str, anagram_dictionary: list, temp=None):
    """
    >>> d = ["Je", "suis", "Voldemort", "Veldomort", "Tom", "Jedusor", "Harry", "Potter", "Hermione", "Granger", "Ron"]
    >>> find_sentence_anagrams1("Tom Elvis Jedusor".lower(), [i.lower() for i in d])
    ['je', 'suis', 'voldemort']

    >>> d.extend(["ja", "sais"])
    >>> find_sentence_anagrams1("Tom Elvis Jedasor".lower(), [i.lower() for i in d])
    ['je', 'sais', 'voldemort']

    :param sentence: we want to find anagrams of this sentence
    :param anagram_dictionary: candidate words to build the anagram from
    :param temp: the words chosen so far (used internally by the recursion)
    :return: the first result found or None
    """
    if temp is None:
        temp = set()
    if len(sentence) == 0:
        return sorted(temp)
    key = "".join(sorted(sentence.replace(" ", "").lower()))
    for word in list(anagram_dictionary):
        if is_valid_subanagram(key, word.lower()):
            temp.add(word)
            anagram_dictionary.remove(word)
            # .elements() keeps letter multiplicities; joining the Counter
            # itself would emit each remaining letter only once
            remaining = "".join((Counter(key) - Counter(word)).elements())
            res = find_sentence_anagrams1(remaining, anagram_dictionary, temp)
            if res is not None:
                return res
            # backtrack: undo this choice before trying the next word
            temp.discard(word)
            anagram_dictionary.append(word)
    return None
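The remaining-letters step above hinges on a Counter idiom worth seeing in isolation: joining a Counter directly iterates its keys, so duplicate letters are silently lost, while .elements() repeats each key according to its count. A standalone sketch (variable names are illustrative):

from collections import Counter

key = Counter("tomelvisjedusor")
used = Counter("je")
# "".join(key - used) would drop repeated letters;
# .elements() keeps each letter as many times as it remains
remaining = "".join(sorted((key - used).elements()))
print(remaining)  # deilmoorsstuv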
Example #2
from collections import Counter


def is_valid_subanagram(hashed_sentence, word):
    """
    Check whether every letter of ``word`` occurs at least as many times
    in ``hashed_sentence`` as it does in ``word``.

    >>> is_valid_subanagram(Counter("jesuisvoldemort"), Counter("voldemort"))
    True

    >>> is_valid_subanagram(Counter("jesuisvoldemort"), Counter("tromodlov"))
    False

    :param hashed_sentence: the available letters, as a string or a Counter
    :param word: the candidate word, as a string or a Counter
    :return: True if word can be spelled from the available letters
    """
    count_hashed_sentence = Counter(hashed_sentence)
    count_word = Counter(word)
    for c in count_word:
        # a missing key counts as 0, so no separate membership test is needed
        if count_word[c] > count_hashed_sentence[c]:
            return False
    return True
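Because Counter subtraction already discards non-positive counts, the same check collapses to a single expression. A minimal equivalent sketch (the function name is ours, not from the source):

from collections import Counter

def is_valid_subanagram_short(hashed_sentence, word):
    # the difference is empty exactly when every letter of word
    # is covered, with multiplicity, by the sentence's letters
    return not (Counter(word) - Counter(hashed_sentence))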
Example #3
from collections import Counter


def find_sentence_anagrams2(sentence: str, anagram_dictionary: list, temp=None, res=None):
    """
    >>> d = ["Je", "suis", "Voldemort", "Veldomort", "Tom", "Jedusor", "Harry", "Potter", "Hermione", "Granger", "Ron"]
    >>> a = find_sentence_anagrams2("Tom Elvis Jedusor".lower(), [i.lower() for i in d])
    >>> "je suis voldemort" in a
    True

    >>> d.extend(["ja", "sais"])
    >>> "je sais voldemort" in find_sentence_anagrams2("Tom Elvis Jedasor".lower(), [i.lower() for i in d])
    True

    :param sentence: we want to find anagrams of this sentence
    :param anagram_dictionary: candidate words to build the anagrams from
    :param temp: the phrase built so far (used internally by the recursion)
    :param res: the set of complete anagrams found so far
    :return: the set of all anagram sentences that can be formed
    """
    if temp is None:
        temp = ""
    if res is None:
        res = set()
    if len(sentence) == 0:
        res.add(temp)
        return res
    key = "".join(sorted(sentence.replace(" ", "").lower()))
    for word in list(anagram_dictionary):
        if is_valid_subanagram(key, word.lower()):
            candidate = temp + " " + word if temp else word
            anagram_dictionary.remove(word)
            remaining = "".join((Counter(key) - Counter(word)).elements())
            find_sentence_anagrams2(remaining, anagram_dictionary, candidate, res)
            # backtrack so later branches can reuse the word
            anagram_dictionary.append(word)
    return res
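Since the search explores every word order, the returned set holds each anagram in several orderings ("je suis voldemort", "suis je voldemort", and so on). A small sketch of one way to collapse them, assuming word order within a result does not matter (the helper is ours, not from the source):

def unique_anagrams(results):
    # sort the words within each result so reorderings compare equal
    return {" ".join(sorted(r.split())) for r in results}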
Example #4
from collections import Counter
from nltk.tokenize import word_tokenize


def create_lexicon(pos, neg):
    lexicon = []
    for file_name in [pos, neg]:
        with open(file_name, 'r') as f:
            for line in f.read().split('\n'):
                line = line.strip()
                if line:
                    all_words = word_tokenize(line)
                    lexicon += [w.lower() for w in all_words]
    # drop stopwords (stop_words is assumed to be defined at module level)
    lexicons = []
    for word in lexicon:
        if word not in stop_words:
            lexicons.append(word)
    word_counts = Counter(lexicons)  # maps each word to its frequency
    # keep only words that appear fewer than 4000 times
    l2 = []
    for word in word_counts:
        if word_counts[word] < 4000:
            l2.append(word)
    return l2
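A hypothetical call, assuming pos.txt and neg.txt hold one document per line and using nltk's English stopword list for the module-level stop_words:

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
lexicon = create_lexicon('pos.txt', 'neg.txt')  # file names are placeholders
print(len(lexicon))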
Example #5
from collections import Counter


def count_letters(string: str):
    """
    >>> count_letters("bonjour oui non")
    Counter({'o': 4, 'n': 3, 'u': 2, 'b': 1, 'j': 1, 'r': 1, 'i': 1})
    """
    return Counter(string.replace(" ", ""))
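The returned Counter can be queried directly for the most frequent letters with most_common:

letters = count_letters("bonjour oui non")
print(letters.most_common(2))  # [('o', 4), ('n', 3)]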
Example #6
from collections import Counter

# get_data, get_most_common_vocab, load_vocab and create_vocab_dict are
# project-specific helpers assumed to be defined elsewhere
#----------------get the dataset--------------------------------
data_main = get_data.Datasets()

#----------------get the vocab------------------------------------
train_reviews, train_sentences, train_tokens = data_main.get_normalized_data(
    "train")
unlabeled_reviews, unlabeled_sentences, unlabeled_tokens = data_main.get_normalized_data(
    "unlabeled")
test_reviews, test_sentences, test_tokens = data_main.get_normalized_data(
    "test")

all_cleaned_reviews = train_reviews + unlabeled_reviews + test_reviews

# build one Counter over every token in the corpus
all_tokens = train_tokens + test_tokens + unlabeled_tokens
counter = Counter()

for tokens in all_tokens:
    counter.update(tokens)
print(len(counter))  # number of distinct tokens, i.e. the vocabulary size

common_keys = get_most_common_vocab(MAX_NB_WORDS, counter)
print("common keys")
print(common_keys)

#load the vocab
vocab_name = "tuning_vocab_txt"
text = load_vocab(vocab_name)
slices = set(text.split())
vocabulary = create_vocab_dict(slices)
print(vocabulary)
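get_most_common_vocab is project code not shown here. A plausible minimal version, assuming it simply keeps the n most frequent tokens via Counter.most_common:

def get_most_common_vocab(n, counter):
    # hypothetical sketch: the n most frequent tokens from the corpus counter
    return {word for word, _ in counter.most_common(n)}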
Example #7
    # requires: import nltk, from nltk.util import ngrams,
    # and from collections import Counter
    data = []

    # rejoin markers such as "<s>" that nltk.word_tokenize splits
    # into the three tokens "<", "s" and ">"
    arr = nltk.word_tokenize(res)
    result = []
    for index in range(len(arr)):
        if arr[index] == "<" and index + 2 < len(arr):
            result.append(arr[index] + arr[index + 1] + arr[index + 2])
        elif arr[index] in (">", "s", "f"):
            # leftover pieces of a marker that was already rejoined
            continue
        else:
            result.append(arr[index])

    data.append(result)

    frequencies = Counter()
    freq_bi = Counter()
    freq_tr = Counter()
    freq_fo = Counter()
    freq_fi = Counter()

    unigram = ngrams(result, 1)
    bigrams = ngrams(result, 2)
    trigrams = ngrams(result, 3)
    fourgrams = ngrams(result, 4)
    fivegrams = ngrams(result, 5)

    frequencies += Counter(unigram)
    freq_bi += Counter(bigrams)
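    # The snippet stops mid-pattern; presumably the remaining orders are
    # accumulated the same way, after which most_common gives the top n-grams:
    freq_tr += Counter(trigrams)
    freq_fo += Counter(fourgrams)
    freq_fi += Counter(fivegrams)

    print(frequencies.most_common(10))  # ten most frequent unigrams
    print(freq_bi.most_common(10))      # ten most frequent bigrams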