import collections

import spacy
from tqdm import tqdm


def grammar_dependencies_count(headlines, bodies):
    # Note: the 'en' shortcut works on spaCy 1.x/2.x; newer versions use e.g. 'en_core_web_sm'
    parser = spacy.load('en')

    grammar_counts = {}
    print("starting parser")
    # tagsDict = {k: v for v, k in enumerate(parser.pipe_labels['parser'])}
    tagsDict = parser.pipe_labels['parser']

    for i, doc in enumerate(parser.pipe(bodies, batch_size=1000, n_threads=4)):
        counts = collections.Counter()
        for w in doc:
            counts[w.dep_] += 1
        ssum = sum(counts.values())
        for k, v in counts.items():
            counts[k] = (counts[k] / ssum)
        grammar_counts[i] = counts
    rv = list(range(len(bodies)))
    print("starting lists")
    for i, b in tqdm(enumerate(bodies)):
        try:
            rv[i] = []
            for k in tagsDict:
                rv[i].append(grammar_counts[i].get(k, 0))
        except Exception:
            # Occasionally the way spaCy processes unusual characters (bullet points,
            # em dashes) causes the lookup based on the original characters to fail.
            # In that case, fall back to an empty placeholder for this document.
            print("Error in GrammarTransformer, setting to empty placeholder")
            # print(text)
            rv[i] = {}
            continue

    return rv
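A minimal usage sketch for the function above, with invented article texts (the headlines argument is accepted but never used, so any placeholder works; a spaCy English model must be installed):

bodies = [
    "The quick brown fox jumps over the lazy dog.",
    "Stock markets rallied today after the announcement.",
]
features = grammar_dependencies_count(headlines=None, bodies=bodies)
# Each entry is a list of dependency-label relative frequencies, or {} if parsing failed
print(len(features), len(features[0]))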
Example #2
import collections

import requests
from bs4 import BeautifulSoup
from django.http import JsonResponse
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def wiki_test(request, word1, word2):
    page = requests.get("https://en.wikipedia.org/wiki/" + word1)
    soup = BeautifulSoup(page.content, features="html.parser")

    word_tokenized1 = []
    size = len(soup.find_all('p'))
    content = soup.find_all('p')
    for i in range(size):
        word_tokenized1.append(word_tokenize(content[i].get_text()))

    english_stopwords = set(stopwords.words('english'))

    filtered_words1 = []
    for tokens in word_tokenized1:
        for word in tokens:
            if word not in english_stopwords and word.isalpha():
                filtered_words1.append(word)

    most_common_words1 = (collections.Counter(filtered_words1).most_common(100))

#---------------------------------------------------------
    page = requests.get("https://en.wikipedia.org/wiki/" + word2)
    soup = BeautifulSoup(page.content, features="html.parser")

    word_tokenized2 = []
    size = len(soup.find_all('p'))
    content = soup.find_all('p')
    for i in range(size):
        word_tokenized2.append(word_tokenize(content[i].get_text()))

    filtered_words2 = []
    for tokens in word_tokenized2:
        for word in tokens:
            if word not in english_stopwords and word.isalpha():
                filtered_words2.append(word)

    most_common_words2 = (collections.Counter(filtered_words2).most_common(100))

    len2 = 0
    x = 0
    # Sum the counts and remember the index at which word1 appears
    for i, (w, c) in enumerate(most_common_words2):
        len2 += c
        if w == word1:
            x = i
    return JsonResponse({'s': (most_common_words2[x][1] / len2) * 100})
Example #3
import collections
import sys

from nltk.tokenize import sent_tokenize


def contexts(input, mode, keyword):
    # Redirect stdout so the Counter printed at the end lands in an output file
    sys.stdout = open(
        './01_out/' + input.split('_')[0] + '_cont_' + keyword + '_' + mode +
        '.txt', 'w')
    f = open('./00_data/' + input, encoding=mode)
    token = sent_tokenize(f.read())
    f.close()
    data = [i.split(' ') for i in token]
    pairs = []
    for i in data:
        for j in range(1, len(i) - 1):
            if i[j].lower() == keyword.lower():
                # Record the word immediately before and after each keyword occurrence
                pairs.append(i[j - 1] + '_' + i[j + 1])
    print(collections.Counter(pairs))
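The same before/after-context extraction in isolation, on an in-memory string instead of a file (assuming the NLTK punkt tokenizer data is available; the text and keyword are made up):

import collections

from nltk.tokenize import sent_tokenize

text = "the cat sat on the mat. the dog sat on the rug."
pairs = []
for sent in sent_tokenize(text):
    words = sent.split(' ')
    for j in range(1, len(words) - 1):
        if words[j].lower() == 'sat':
            # Record the word before and after each occurrence of the keyword
            pairs.append(words[j - 1] + '_' + words[j + 1])
print(collections.Counter(pairs))  # Counter({'cat_on': 1, 'dog_on': 1})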
Example #4
import collections


def get_word_counts(config, from_timestamp, to_timestamp, count):

    posts = get_posts_from_range(config, from_timestamp, to_timestamp)
    # Count word occurrences directly with a Counter
    word_counter = collections.Counter()
    for post in posts:
        word_counter.update(get_cleaned_text(post['text']).split())
    words = []
    for word in word_counter.most_common(count):
        words.append({'word': word[0], 'count': word[1]})
    return words
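The most_common-to-dict-list step in isolation, with made-up counts, to show the shape of the returned value:

import collections

word_counter = collections.Counter({"nltk": 5, "python": 3, "counter": 2})
words = [{'word': w, 'count': c} for w, c in word_counter.most_common(2)]
print(words)  # [{'word': 'nltk', 'count': 5}, {'word': 'python', 'count': 3}]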
Example #5
import collections


def build_dictionary(sentences, vocabulary_size):
    # Split each sentence into words and flatten them into a single list
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]

    # count holds (word, word_count) pairs; the first entry is the unknown-word bucket ('RARE', -1)
    count = [('RARE', -1)]

    # Use collections.Counter to count word frequencies and keep the most common ones
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

    # Map each word to an index
    # Note: the original version built indices by accumulating the dict length, which is
    # slower and harder to follow; direct enumeration is used instead
    word_dict = {x: i for i, (x, _) in enumerate(count)}
    # word_dict = {}
    # for word, word_count in count:
    #     word_dict[word] = len(word_dict)

    return word_dict
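A quick usage sketch with toy sentences (the sentences and vocabulary size are made up for illustration):

sentences = ["the cat sat on the mat", "the dog sat"]
word_dict = build_dictionary(sentences, vocabulary_size=5)
print(word_dict)  # e.g. {'RARE': 0, 'the': 1, 'sat': 2, 'cat': 3, 'on': 4}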
Example #6
    def calculate_and_store_types(self):
        for tweet in self.tweets:
            text = nltk.word_tokenize(tweet)
            tagged = nltk.pos_tag(text)
            for word_and_type in tagged:
                self.types.append(word_and_type[1])

        self.types_and_numbers_list = list(collections.Counter(self.types).items())

        feature_list = ['VBZ', ',', 'CD', 'JJS', 'WDT', 'VBP', '#', 'PRP$', 'JJR']
        #feature_list = [',', 'CD',   'VBP', '#', 'PRP$']

        for j in feature_list:
            self.func(j)

        self.valid_types_and_numbers.sort(key=lambda x: x[0])

        for i in self.valid_types_and_numbers:
            self.features_array.append(i[1])

        ############################################# ADD THE WORDS I EXTRACTED MYSELF AS FEATURES

        # avg = int(sum(self.features_array) / len(self.features_array))

        # Count the words once instead of rebuilding the Counter inside each loop
        word_counts = Counter(self.words)

        for word in commonly_used_words_by_males:
            self.features_array.append(word_counts.get(word, 0))

        for word in words_of_technology:
            self.features_array.append(word_counts.get(word, 0))

        for word in commonly_used_words_by_females:
            self.features_array.append(word_counts.get(word, 0))
Example #7
    def sentence_vector(self, sentence: str, sample=None) -> ndarray:
        tokens = [
            token for token in tokenizer.tokenize(sentence)
            if token.isprintable() and token not in _stop_words
        ]

        if sample:
            # Keep only the `sample` most frequent unigrams
            tokens = list(everygrams(tokens, 1, 1))  # .sort(key=lambda x: x[1])[:10]
            tokens = [
                word[0]
                for word, count in collections.Counter(tokens).most_common(sample)
            ]
        key = hashlib.md5(sentence.encode('utf-8')).hexdigest()
        if self.redis is not None and self.redis.exists(key):
            vector = Embeddings.load_vector_from_cache(key, self.redis)
        else:
            vector = self.arithmetic_mean_bow_embedding(tokens)
            if self.redis is not None:
                Embeddings.cache_vector(key, vector, self.redis)

        return vector
Example #8
    total = vocabsize['aggregations']['counts']['value']
    print("%s\t%s\t%s\t%s" % (genre['key'], unq / total, total, unq))

for genre in res['aggregations']['genres']['buckets']:
    curr_genre = genre["key"]
    genre_songs_list = []

    songs_by_genre = {
        "query": {
            "match": {
                "album.genre": curr_gnere
            }
        },
        "size": NUM_OF_SONGS
    }
    genre_res = es.search(index=ES_INDEX, doc_type=ES_TYPE, body=songs_by_genre)
    for song in genre_res["hits"]["hits"]:
        lyrics = song["_source"]["lyrics"]
        tokens = nltk.word_tokenize(lyrics)
        english_stopwords = set(stopwords.words('english'))
        filtered_tokens = [
            token for token in tokens if token not in english_stopwords
        ]
        tokens_counter = collections.Counter(filtered_tokens)
        tuple_list = list(tokens_counter.items())
        genre_songs_list.append(tuple_list)

    with open(
            '/home/omri/Dev/Python/IntroToDS/Data/pickle_list/' + curr_genre +
            '.pickle', 'wb') as handle:
        pickle.dump(genre_songs_list, handle, protocol=pickle.HIGHEST_PROTOCOL)
Example #9
    def generator(self, spell, sentence, call):
        if (call == 5):
            return sentence
        if (sentence == "" and self.n > 1):

            index = random.randrange(len(self.smootedTable) - 1)
            heceler = list(list(self.smootedTable.items())[index])[0]

            sentence = ''.join([str(elem) for elem in heceler])
            newSpell = []
            if self.n > 4:
                newSpell.append(heceler[len(heceler) - 4])
            if self.n > 3:
                newSpell.append(heceler[len(heceler) - 3])
            if self.n > 2:
                newSpell.append(heceler[len(heceler) - 2])
            newSpell.append(heceler[len(heceler) - 1])
            call += 1

            return self.generator(newSpell, sentence, call)
        elif self.n == 1:
            founded = collections.Counter(self.smootedTable).most_common(5)
            for i in founded:
                sentence += i[0]
            return sentence

        else:
            newSpell = []

            founded = dict()

            # Keep every n-gram whose leading context (first min(n-1, 4) syllables)
            # matches the current spell
            prefix_len = min(self.n - 1, 4)
            for key in self.smootedTable:
                if tuple(key[:prefix_len]) == tuple(spell[:prefix_len]):
                    founded[key] = self.smootedTable[key]

            founded = collections.Counter(founded).most_common(1)

            if len(founded) == 0:
                return sentence

            found = founded[0]

            sentence += str(found[0][len(found[0]) - 1])
            if self.n > 4:
                newSpell.append(found[0][len(found[0]) - 4])
            if self.n > 3:
                newSpell.append(found[0][len(found[0]) - 3])
            if self.n > 2:
                newSpell.append(found[0][len(found[0]) - 2])
            newSpell.append(found[0][len(found[0]) - 1])

            call += 1
            return self.generator(newSpell, sentence, call)
Example #10
    def setNGramTable(self):
        self.nGramTable = collections.Counter(self.nGram)
Example #11
    def topKFrequent2(self, nums: List[int], k: int) -> List[int]:
        # Needs `import heapq` and `from typing import List` at module level
        import collections  # stdlib collections provides Counter here, not nltk
        counts = collections.Counter(nums)
        print(counts.keys())
        return heapq.nlargest(k, counts.keys(), key=counts.get)
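A standalone sketch of the same approach outside the class, for quick testing (the function name top_k_frequent is made up here):

import collections
import heapq


def top_k_frequent(nums, k):
    # Count occurrences, then take the k keys with the largest counts
    counts = collections.Counter(nums)
    return heapq.nlargest(k, counts.keys(), key=counts.get)


print(top_k_frequent([1, 1, 1, 2, 2, 3], 2))  # [1, 2]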
Example #12
df['token'] = df['token'].apply(
    lambda x: [item for item in x if item not in sw])

df['tokenstring'] = [' '.join(map(str, l)) for l in df['token']]

lists = df['token']
row_list = []
no_of_lists_per_name = Counter(chain.from_iterable(map(set, lists)))
for name, no_of_lists in no_of_lists_per_name.most_common():
    if no_of_lists == 1:
        break  # since it is ordered by count, once we get this low we are done
    row_list.append([name, no_of_lists])
df_cat_1 = pd.DataFrame(row_list, columns=['cat', 'cat_count'])

#print(Counter(list(ngrams(df['token'], 2))))
counts = collections.Counter()  # or nltk.FreqDist()
#for sent in df['token']:
#    counts.update(nltk.ngrams(sent, 2))

#print(counts)

for sent in df['token']:
    counts.update(" ".join(n) for n in nltk.ngrams(sent, 2))
df_cat_2 = pd.DataFrame.from_records(counts.most_common(),
                                     columns=['cat', 'cat_count'])
df_cat = pd.concat([df_cat_1, df_cat_2])  # DataFrame.append was removed in pandas 2.x
df = pd.merge(df, df_cat, left_on=["tokenstring"], right_on="cat")

df.to_csv('temp.csv', encoding='utf-8-sig', index=False)

df = df.groupby(['category']).agg({
Example #13
import collections


def find_tf_string(token_list):
    # Term frequency: each token's count divided by the total number of tokens
    weight_dict = collections.Counter(token_list)
    tf_dict = {}
    for word, weight in weight_dict.items():
        tf_dict[word] = weight / len(token_list)
    return tf_dict
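A minimal usage sketch with a toy token list:

tokens = ["the", "cat", "sat", "on", "the", "mat"]
print(find_tf_string(tokens))
# 'the' -> 2/6 ≈ 0.33, every other token -> 1/6 ≈ 0.17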
Example #14
print("Topic Stats")
print(get_data_frame_stats(df))

train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True)
X_train = train.text.apply(lambda text: get_cleaned_text(text))
X_test = test.text.apply(lambda text: get_cleaned_text(text))

# Count word frequencies across the combined train and test texts
# (Series.append was removed in pandas 2.x; concat is the replacement)
word_counter = collections.Counter()
for text in pd.concat([X_train, X_test]):
    word_counter.update(text.split())
n_print = int(input("How many most common words to print: "))
for word, count in word_counter.most_common(n_print):
    print(count, word)
# MultiLabelBinarizer().fit_transform(train)

start = time.time()

train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True)
X_train = train.text.apply(lambda text: get_cleaned_text(text))
X_test = test.text.apply(lambda text: get_cleaned_text(text))

NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
    # ('clf', LinearSVC()),