from nltk.tokenize import SpaceTokenizer


def get_words_by_content(content):
    """Split post content into a flat list of whitespace-separated tokens."""
    words = []
    tokenizer = SpaceTokenizer()
    words += tokenizer.tokenize(content)
    # words = list(set(words))
    # words = frozenset(words)
    return words
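# Example call, assuming NLTK is installed. SpaceTokenizer splits on single
# spaces only, so punctuation stays attached to the neighbouring word.
print(get_words_by_content("hello world, again"))  # ['hello', 'world,', 'again']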
def generate(sents, c, dictionary):
    '''
    args   : iterable of sentence strings, context radius c, Dictionary with a word2idx mapping
    return : context-target pairs as two numpy arrays (contexts x, targets y)
    '''
    tokenizer = SpaceTokenizer()
    xs = []
    ys = []
    for sent in sents:
        sent = tokenizer.tokenize(sent)
        start = c
        end = len(sent)
        # Slide a window over the sentence: the centre word at position i is the
        # target, and the c words on each side of it form the context.
        for i in range(start, end - c):
            context = []
            for j in range(-c, c + 1):
                if j != 0:
                    context.append(dictionary.word2idx[sent[i + j]])
            xs.append(context)
            ys.append(dictionary.word2idx[sent[i]])
    x = np.vstack(xs)
    y = np.vstack(ys)
    return x, y
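# Minimal usage sketch for generate(). _ToyDictionary below is a hypothetical
# stand-in for this project's Dictionary class (only the word2idx / add_word
# interface that generate() relies on is reproduced); numpy and NLTK's
# SpaceTokenizer are assumed to be importable as in the snippet above.
import numpy as np
from nltk.tokenize import SpaceTokenizer


class _ToyDictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)


_toy = _ToyDictionary()
_sents = ["the quick brown fox jumps over the lazy dog"]
for _w in SpaceTokenizer().tokenize(_sents[0]):
    _toy.add_word(_w)
_x, _y = generate(_sents, 2, _toy)
print(_x.shape, _y.shape)  # (5, 4) and (5, 1) for a nine-word sentence with c = 2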
class GraphBuilder_Bag_Of_Words(GraphBuilder):
    """The edge between two authors is the Jaccard similarity between the bag of words of each author."""

    def __init__(self, db):
        GraphBuilder.__init__(self, db)
        if self._domain == u'Microblog':
            self._tokenizer = TweetTokenizer()
        else:
            self._tokenizer = SpaceTokenizer()

    def execute(self, window_start=None):
        start_time = time.time()
        logging.info("execute started for " + self.__class__.__name__ + " at " + str(start_time))
        logging.info("getting posts from DB")
        if self._num_of_random_authors_for_graph is None:
            posts_by_domain = self._db.get_author_posts_dict_by_minimal_num_of_posts(
                self._domain, self._min_number_of_posts_per_author)
        else:
            # if not self._are_already_randomize_authors_for_graphs():
            #     self._db.randomize_authors_for_graph(self._min_number_of_posts_per_author, self._domain, self._num_of_random_authors_for_graph)
            posts_by_domain = self._db.get_random_author_posts_dict_by_minimal_num_of_posts()

        all_authors_count = len(posts_by_domain)
        total_combinations = (all_authors_count * (all_authors_count - 1)) // 2
        current = 0

        # posts_by_domain: key = author_guid, value = list of posts
        bag_of_words_per_author = {}
        for author, posts in posts_by_domain.items():
            bow = []
            for post in posts:
                content = post.content
                if content is not None:
                    bow += self._tokenizer.tokenize(content)
            bag_of_words_per_author[author] = frozenset(bow)
            current += 1
            if current % 10000 == 0:
                print('\r done author ' + str(current) + ' out of ' + str(all_authors_count), end='')
        logging.info("done computing bag of words")

        all_pairs = combinations(bag_of_words_per_author.keys(), 2)
        # Casting all_pairs to a materialized collection (e.g. a frozenset) is NOT a good
        # idea: combinations() returns a generator, which is more memory- and CPU-efficient
        # than building the full list of pairs up front.
        logging.info("computing similarity between bags of words")
        author_connections = []
        current = 0
        for author_a, author_b in all_pairs:
            weight = self.compute_jaccard_index(bag_of_words_per_author[author_a],
                                                bag_of_words_per_author[author_b])
            author_connections.append((author_a, author_b, weight))
            current += 1
            if current % 10000 == 0:
                print('\r done pair ' + str(current) + ' out of ' + str(total_combinations), end='')
                # Flush connections in chunks so the list does not grow without bound.
                self._fill_author_connections(author_connections)
                author_connections = []
        # Flush whatever is left after the last full chunk.
        self._fill_author_connections(author_connections)

        end_time = time.time()
        duration = end_time - start_time
        logging.info(" total time taken " + str(duration))

    def compute_jaccard_index(self, set_1, set_2):
        n = len(set_1.intersection(set_2))
        return n / float(len(set_1) + len(set_2) - n)
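# Quick standalone check of the Jaccard weight computed above; the two word
# sets are made-up examples rather than data pulled from the DB, and _jaccard
# simply mirrors compute_jaccard_index without the surrounding class.
def _jaccard(set_1, set_2):
    n = len(set_1.intersection(set_2))
    return n / float(len(set_1) + len(set_2) - n)


print(_jaccard(frozenset(['a', 'b', 'c']), frozenset(['b', 'c', 'd'])))  # 0.5: 2 shared words out of 4 distinct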
import time

import numpy as np
from nltk.corpus import gutenberg
from nltk.tokenize import SpaceTokenizer

# normalize, Dictionary, generate and batchify are assumed to be defined elsewhere in this project.

total_loss = 0

if __name__ == "__main__":
    tokenizer = SpaceTokenizer()
    normalize_corpus = np.vectorize(normalize)
    raw = gutenberg.sents('bible-kjv.txt')

    start_time = time.time()
    norm = normalize_corpus(raw[:100])
    elapsed = time.time() - start_time

    # Fill out the dictionary with every token of the normalized corpus.
    dictionary = Dictionary()
    for sent in norm:
        words = tokenizer.tokenize(sent)
        for word in words:
            dictionary.add_word(word)
    # print("length of dict: ", len(dictionary))
    # print("word2idx: ", dictionary.word2idx)
    # print("idx2dict: ", dictionary.idx2word)

    # Generate context-target pairs.
    start_time = time.time()
    pairs = generate(norm, 2, dictionary)
    elapsed = time.time() - start_time

    x, y = batchify(pairs)
    print(x[:10])
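# batchify() is referenced above but not defined in this snippet. The sketch
# below is a hypothetical placeholder, assuming it only shuffles the
# context/target arrays and trims them to a multiple of the batch size;
# the project's real implementation may differ.
def batchify(pairs, batch_size=20):
    x, y = pairs
    n = (len(x) // batch_size) * batch_size  # drop the ragged tail
    perm = np.random.permutation(len(x))[:n]
    return x[perm], y[perm]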
import csv

import numpy as np
import pylab
import matplotlib.pyplot as plt
import scipy.stats
import gensim
from gensim import corpora, models
from nltk.tokenize import SpaceTokenizer
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words

# sql_worker is a project-local MySQL helper class defined elsewhere.


def main():
    USER = "******"
    PASSWD = "089567"
    HOST = "localhost"
    CHARSET = "utf8mb4"
    DATABASE = "music"
    COLLATE = "utf8mb4_unicode_ci"
    # PORT = "3306"
    PATH = './'
    PATH_U = './users'

    x = sql_worker(USER, PASSWD, HOST, CHARSET, DATABASE, COLLATE)
    print("Connection", x.connect())
    print("Creation", x.create())
    print("Selecting database", x.use(DATABASE))
    print("Please wait, processing the database")

    dlalbom = []
    dlpesnya = []
    textsong = []
    artist = x.select("select id,name from wc_lyricsnet_artists limit 1000")
    for elem in artist:
        album = x.select(
            "select `id`,`name` from `wc_lyricsnet_albums` where `artist_id` = '%s' and `year` = 1995;" % (elem[0]))
        for each in album:
            song = x.select(
                "select `title` from `wc_lyricsnet_songs` where `artist_id` = '%s' and `album_id` = '%s';" % (elem[0], each[0]))
            for ef in song:
                ef = list(ef)
                ef = ''.join(map(str, ef))
                textsong.append(ef)
                # csvwww.write(str(elem[1]) + "\t" + str(each[1]) + "\t" + str(ef) + "\n")
                # building the dictionary
                # print(elem[1])
                # print(each[1])
                # print(ef)
                dlinaalbom = len(str(each[1]))
                dlinasong = len(str(ef))
                dlalbom.append(dlinaalbom)
                dlpesnya.append(dlinasong)

    xx = np.array(dlalbom)
    yy = np.array(dlpesnya)
    pylab.plot(yy, xx, 'r')
    pylab.xlabel('Number of characters in the song title')
    pylab.ylabel('Number of characters in the album title')
    pylab.title('Relationship between title lengths (in characters)')
    pylab.show()
    correl = np.corrcoef(yy, xx)
    print("Correlation between song and album title lengths")
    print("Close the chart to continue")
    print("###")
    print(correl)
    plt.scatter(yy, xx)
    plt.show()
    print("Pearson correlation", scipy.stats.pearsonr(yy, xx))
    print("\n")

    # LDA
    texts = []
    tokentoken = SpaceTokenizer()    # tokenization
    en_stop = get_stop_words('en')   # stop words
    p_stemmer = PorterStemmer()      # stemming
    for i in textsong:
        raw = i.lower()
        tokens = tokentoken.tokenize(raw)
        stopped_tokens = [i for i in tokens if i not in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    dic = corpora.Dictionary(texts)
    corps = [dic.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corps, num_topics=7, id2word=dic, passes=20)
    try:
        plt.style.use('ggplot')
    except Exception:
        pass
    print("LDA results:")
    print("###")
    print(ldamodel.print_topics(num_topics=7, num_words=3))
    print("\n")

    allwords = []
    for elements in texts:
        for allell in elements:
            allwords.append(allell)
    stoplist = set('for a of the and to in'.split())

    # LSI
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in allwords]
    alltokens = sum(texts, [])
    tokens1 = set(word for word in set(alltokens) if alltokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens1] for text in texts]
    corp = [dic.doc2bow(text) for text in texts]
    lsi = models.lsimodel.LsiModel(corpus=corp, id2word=dic, num_topics=3)
    print("LSI results:")
    print("###")
    print(lsi.print_topics(3))

    # Build a word-frequency dictionary from the third column of csvdict1.csv.
    slovnik = {}
    with open("csvdict1.csv", 'r') as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
            kluch1 = row[2].split(" ")
            for elemk in kluch1:
                kluch = str(elemk).lower()
                if kluch in slovnik:
                    slovnik[kluch] += 1
                else:
                    slovnik[kluch] = 1

    stopslova = "a, about, above, after, again, against, all, am, an, and, any, are, aren't, as, at, be, because, been, before, being, below, between, both, but, by, can't, cannot, could, couldn't, did, didn't, do, does, doesn't, doing, don't, down, during, each, few, for, from, further, had, hadn't, has, hasn't, have, haven't, having, he, he'd, he'll, he's, her, here, here's, hers, herself, him, himself, his, how, how's, i, i'd, i'll, i'm, i've, if, in, into, is, isn't, it, it's, its, itself, let's, me, more, most, mustn't, my, myself, no, nor, not, of, off, on, once, only, or, other, ought, our, ours, ourselves, out, over, own, same, shan't, she, she'd, she'll, she's, should, shouldn't, so, some, such, than, that, that's, the, their, theirs, them, themselves, then, there, there's, these, they, they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, very, was, wasn't, we, we'd, we'll, we're, we've, were, weren't, what, what's, when, when's, where, where's, which, while, who, who's, whom, why, why's, with, won't, would, wouldn't, you, you'd, you'll, you're, you've, your, yours, yourself, yourselves"
    # Turn the comma-separated stop-word string into a set so the membership
    # test below matches whole words instead of substrings.
    stopslova = set(word.strip() for word in stopslova.split(','))

    sortklucha = sorted(slovnik, key=lambda k: slovnik[k], reverse=True)
    zapis = open("slovnik.csv", 'w')
    zapis2 = open("slovnikstopwords.csv", "w")
    zapis.write("Word" + "\t" + "Frequency" + "\n")
    zapis2.write("Word" + "\t" + "Frequency" + "\n")
    try:
        for kluch in sortklucha:
            jjj = "{0}\t{1}\n".format(kluch, slovnik[kluch])
            zapis.write(jjj)  # the full frequency dictionary goes to slovnik.csv
        print("Frequency dictionary created")
    finally:
        zapis.close()
    try:
        for kluch in sortklucha:
            if kluch not in stopslova:
                jjj = "{0}\t{1}\n".format(kluch, slovnik[kluch])
                zapis2.write(jjj)
        print("Frequency dictionary without stop words created")
    finally:
        zapis2.close()
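# A tiny end-to-end sketch of the gensim pipeline used above (tokenized docs ->
# corpora.Dictionary -> doc2bow -> LdaModel); the three documents are made up
# rather than pulled from the MySQL song titles.
from gensim import corpora, models

_docs = [["human", "machine", "interface"],
         ["graph", "trees", "minors"],
         ["graph", "minors", "survey"]]
_dic = corpora.Dictionary(_docs)
_bow = [_dic.doc2bow(d) for d in _docs]
_lda = models.ldamodel.LdaModel(_bow, num_topics=2, id2word=_dic, passes=5)
print(_lda.print_topics(num_topics=2, num_words=2))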
class Bag_Of_Words_Graph_Builder(GraphBuilder):
    """The edge between two authors is the Jaccard similarity between the bag of words of each author."""

    def __init__(self, db):
        GraphBuilder.__init__(self, db)
        self._author_guid_posts_dict = {}
        self._author_guid_bag_of_words_dict = {}
        self._word_dict = {}
        if self._domain == u'Microblog':
            self._tokenizer = TweetTokenizer()
        else:
            self._tokenizer = SpaceTokenizer()

    def execute(self, window_start=None):
        pass

    def fill_author_guid_posts_dictionary(self):
        self._author_guid_posts_dict = self._db.get_author_posts_dict_by_minimal_num_of_posts(
            self._domain, self._min_number_of_posts_per_author)

    def fill_author_guid_bag_of_words_dictionary(self):
        all_authors_count = len(self._author_guid_posts_dict)
        i = 0
        # self._author_guid_posts_dict: key = author_guid, value = list of posts
        for author_guid, posts in self._author_guid_posts_dict.items():
            bow = []
            for post in posts:
                content = post.content
                if content is not None:
                    # Lowercase the content and strip URLs before tokenizing.
                    content = content.lower()
                    content = re.sub(r'http\S+', '', content)
                    bow += self._tokenizer.tokenize(content)
            bow = list(frozenset(bow))
            self._author_guid_bag_of_words_dict[author_guid] = bow
            for word in bow:
                if word not in self._word_dict:
                    self._word_dict[word] = word
            i += 1
            if i % 100000 == 0:
                print('\r done author ' + str(i) + ' out of ' + str(all_authors_count), end='')
        logging.info("done computing bag of words")

    def compute_jaccard_index(self, set_1, set_2):
        n = len(set_1.intersection(set_2))
        return n / float(len(set_1) + len(set_2) - n)

    def fill_author_guid_bag_of_words_dictionary_and_calculate_all_combinations(self):
        self.fill_author_guid_bag_of_words_dictionary()
        author_guids = self._author_guid_bag_of_words_dict.keys()
        all_authors_count = len(author_guids)
        all_pairs = combinations(author_guids, 2)
        total_combinations = (all_authors_count * (all_authors_count - 1)) // 2
        # Casting all_pairs to a materialized collection (e.g. a frozenset) is NOT a good
        # idea: combinations() returns a generator, which is more memory- and CPU-efficient
        # than building the full list of pairs up front.
        logging.info("computing similarity between bags of words")
        i = 0
        for author_guid_1, author_guid_2 in all_pairs:
            i += 1
            print('\r calculating pairs of authors : {0}/{1}'.format(i, total_combinations), end='')
            author_guid_1_bag_of_words = self._author_guid_bag_of_words_dict[author_guid_1]
            author_guid_2_bag_of_words = self._author_guid_bag_of_words_dict[author_guid_2]
            self.calculate_jaccard_index_create_and_save_connection(
                author_guid_1, author_guid_2,
                author_guid_1_bag_of_words, author_guid_2_bag_of_words)
        self._db.save_author_connections(self._author_connections_edges)

    def calculate_jaccard_index_create_and_save_connection(self, author_guid_1, author_guid_2,
                                                           author_guid_1_bag_of_words,
                                                           author_guid_2_bag_of_words):
        weight = self.compute_jaccard_index(set(author_guid_1_bag_of_words),
                                            set(author_guid_2_bag_of_words))
        self._create_and_optional_save_connection(author_guid_1, author_guid_2, weight)
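# Standalone illustration of the per-post preprocessing used above (lowercase,
# strip URLs, whitespace-tokenize, de-duplicate); the sample text is invented.
import re
from nltk.tokenize import SpaceTokenizer

_sample = "Check THIS out http://example.com/x great stuff"
_clean = re.sub(r'http\S+', '', _sample.lower())
_bow = list(frozenset(SpaceTokenizer().tokenize(_clean)))
print(_bow)  # de-duplicated tokens; note SpaceTokenizer splits on single spaces,
             # so an empty token can appear where the URL was removed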