Example #1
def get_words_by_content(content):
    words = []
    tokenizer = SpaceTokenizer()
    words += tokenizer.tokenize(content)
    ##words = list(set(words))
    # words = frozenset(words)
    return words
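A minimal usage sketch for the helper above, assuming SpaceTokenizer is nltk.tokenize.SpaceTokenizer (the import is not shown in the snippet):

from nltk.tokenize import SpaceTokenizer  # assumed source of SpaceTokenizer

words = get_words_by_content("the quick brown fox")
print(words)  # expected: ['the', 'quick', 'brown', 'fox']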
Example #2
def generate(sents, c, dictionary):
    '''
    args : sents - iterable of sentence strings, c - context window size,
           dictionary - object exposing a word2idx mapping
    return : (x, y) numpy arrays of context indices and target indices
    '''

    tokenizer = SpaceTokenizer()

    xs = []
    ys = []

    for sent in sents:
        sent = tokenizer.tokenize(sent)
        start = c
        end = len(sent)

        for i in range(start, end - c):
            context = []
            for j in range(-c, c + 1):
                if j == 0:
                    continue
                context.append(dictionary.word2idx[sent[i + j]])

            xs.append(context)
            ys.append(dictionary.word2idx[sent[i]])

    x = np.vstack(xs)
    y = np.vstack(ys)

    return x, y
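A small self-contained check of the context/target logic in generate(); the ToyDictionary here is a hypothetical stand-in for the Dictionary class used elsewhere on this page:

import numpy as np
from nltk.tokenize import SpaceTokenizer  # assumed source of SpaceTokenizer

class ToyDictionary:
    """Hypothetical stand-in exposing the word2idx mapping generate() expects."""
    def __init__(self, words):
        self.word2idx = {w: i for i, w in enumerate(words)}

dictionary = ToyDictionary("the cat sat on mat".split())
x, y = generate(["the cat sat on the mat"], 2, dictionary)
# With window c=2 the first target is "sat" (index 2) and its context is
# ["the", "cat", "on", "the"] -> [0, 1, 3, 0].
print(x)
print(y)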
Example #3
class GraphBuilder_Bag_Of_Words(GraphBuilder):
    """The edge between two authors is the jaccard similarity between the bag of words of each author"""
    def __init__(self, db):
        GraphBuilder.__init__(self, db)

        if self._domain == u'Microblog':
            self._tokenizer = TweetTokenizer()
        else:
            self._tokenizer = SpaceTokenizer()

    def execute(self, window_start=None):
        start_time = time.time()
        logging.info("execute started for " + self.__class__.__name__ +
                     " started at " + str(start_time))
        logging.info("getting posts from DB ")

        if self._num_of_random_authors_for_graph is None:
            posts_by_domain = self._db.get_author_posts_dict_by_minimal_num_of_posts(
                self._domain, self._min_number_of_posts_per_author)
        else:
            # if not self._are_already_randomize_authors_for_graphs():
            #    self._db.randomize_authors_for_graph(self._min_number_of_posts_per_author, self._domain, self._num_of_random_authors_for_graph)
            posts_by_domain = self._db.get_random_author_posts_dict_by_minimal_num_of_posts()

        all_authors_count = len(posts_by_domain.keys())
        total_combinations = (all_authors_count * (all_authors_count - 1)) // 2  # integer number of author pairs
        current = 0
        # Dictionary: key = author_guid, value = list of posts
        bag_of_words_per_author = {}

        for author, posts in posts_by_domain.items():
            bow = []
            for post in posts:
                content = post.content
                if content is not None:
                    bow += self._tokenizer.tokenize(content)
            bag_of_words_per_author[author] = frozenset(bow)
            current += 1
            if current % 10000 == 0:
                print('\r done author ' + str(current) + ' out of ' +
                      str(all_authors_count),
                      end='')
        logging.info("done computing bag of words ")
        all_pairs = combinations(bag_of_words_per_author.keys(), 2)
        """
        Casting all_pairs to an iterable object (frozenset) is NOT a good idea since combinations function returns a generator object,
        which is more memory and CPU efficient than iterable objects
        """
        logging.info("computing similarity between bag of words ")
        author_connections = []

        current = 0
        for author_a, author_b in all_pairs:
            weight = self.compute_jaccard_index(
                bag_of_words_per_author[author_a],
                bag_of_words_per_author[author_b])
            author_connections.append((author_a, author_b, weight))
            current += 1
            if current % 10000 == 0:
                print('\r done pair ' + str(current) + ' out of ' +
                      str(total_combinations),
                      end='')
                self._fill_author_connections(author_connections)
                author_connections = []
        self._fill_author_connections(author_connections)
        end_time = time.time()
        duration = end_time - start_time
        logging.info(" total time taken " + str(duration))

    def compute_jaccard_index(self, set_1, set_2):
        n = len(set_1.intersection(set_2))
        return n / float(len(set_1) + len(set_2) - n)
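The edge weight above is the Jaccard index |A ∩ B| / |A ∪ B|; a standalone check of the same formula on two made-up bags of words:

def jaccard(a, b):
    # Same formula as compute_jaccard_index above: |A ∩ B| / (|A| + |B| - |A ∩ B|).
    inter = len(a & b)
    return inter / float(len(a) + len(b) - inter)

bow_a = frozenset("the cat sat on the mat".split())
bow_b = frozenset("the cat lay on the rug".split())
print(jaccard(bow_a, bow_b))  # 3 shared words out of 7 distinct -> ~0.429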
Example #4
def get_words_by_content(content):
    words = []
    tokenizer = SpaceTokenizer()
    words += tokenizer.tokenize(content)

    return words
Example #5


if __name__ == "__main__":
    tokenizer = SpaceTokenizer()
    normalize_corpus = np.vectorize(normalize)
    raw = gutenberg.sents('bible-kjv.txt')

    start_time = time.time()
    norm = normalize_corpus(raw[:100])
    elapsed = time.time() - start_time

    # fill out dictionary
    dictionary = Dictionary()
    for sent in norm:
        words = tokenizer.tokenize(sent)
        for word in words:
            dictionary.add_word(word)
    '''
    print("length of dict: ", len(dictionary))
    print("word2idx: ", dictionary.word2idx)
    print("idx2dict: ", dictionary.idx2word)  
    '''

    # generate pairs
    start_time = time.time()
    pairs = generate(norm, 2, dictionary)
    elapsed = time.time() - start_time

    x, y = batchify(pairs)
    print(x[:10])
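normalize() and batchify() are not shown in this snippet; a minimal sketch of a plausible normalize, assuming it only needs to turn the token list that gutenberg.sents yields into a clean lowercase string for SpaceTokenizer to re-split:

import re

def normalize(sent_tokens):
    # Hypothetical normalize(): join the token list from gutenberg.sents,
    # lowercase it, and keep only letters and spaces.
    text = ' '.join(sent_tokens).lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()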
Example #6
File: main.py  Project: haniani/DH
def main():
    USER = "******"
    PASSWD = "089567"
    HOST = "localhost"
    CHARSET = "utf8mb4"
    DATABASE = "music"
    COLLATE = "utf8mb4_unicode_ci"
    #PORT = "3306"
    PATH = './'
    PATH_U = './users'

    x = sql_worker(USER, PASSWD, HOST, CHARSET, DATABASE, COLLATE)
    print("Соединение", x.connect())
    print("Создание", x.create())
    print("Использование", x.use(DATABASE))
    print("Ожидайте, обработка базы")

    dlalbom = []
    dlpesnya = []
    textsong = []
    artist = (x.select("select id,name from wc_lyricsnet_artists limit 1000"))
    for elem in artist:
        album = (x.select("select `id`,`name` from `wc_lyricsnet_albums` where `artist_id` = '%s' and `year` = 1995;" % (elem[0])))
        for each in album:
            song = (x.select("select `title` from `wc_lyricsnet_songs` where `artist_id` = '%s' and `album_id` = '%s';" % (elem[0], each[0])))
            for ef in song:
                ef = list(ef)
                ef = ''.join(map(str, ef))
                textsong.append(ef)
                #csvwww.write(str(elem[1]) + "\t" + str(each[1]) + "\t" + str(ef) + "\n")  # building the dictionary (commented out)
                #print(elem[1])
                #print(each[1])
                #print(ef)
                dlinaalbom = len(str(each[1]))
                dlinasong = len(str(ef))
                dlalbom.append(dlinaalbom)
                dlpesnya.append(dlinasong)
         

    xx = np.array(dlalbom)
    yy = np.array(dlpesnya)
    pylab.plot(yy, xx, 'r')
    pylab.xlabel('Количество символов в названии песни')
    pylab.ylabel('Количество символов в названии альбома')
    pylab.title('Соотношение длины названий (в символах)')
    pylab.show()


    correl = np.corrcoef(yy, xx)
    print("Корреляция длины названий песен и альбомов")
    print("Для продолжения закройте диаграмму")
    print("###")
    print(correl)
    plt.scatter(yy, xx)
    plt.show()


    print("Корреляция Пирсона", scipy.stats.pearsonr(yy, xx))
    print("\n")       
    # LDA
    texts = []
    tokentoken = SpaceTokenizer()     # tokenization
    en_stop = get_stop_words('en')    # English stop words
    p_stemmer = PorterStemmer()       # stemming
    

    for i in textsong:
        raw = i.lower()
        tokens = tokentoken.tokenize(raw)

        stopped_tokens = [i for i in tokens if i not in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

    dic = corpora.Dictionary(texts)
    corps = [dic.doc2bow(text) for text in texts]

    ldamodel = gensim.models.ldamodel.LdaModel(corps, num_topics = 7, id2word = dic, passes = 20)
    try:
        plt.style.use('ggplot')
    except:
        pass

    print("Результат работы LDA:")
    print("###")
    print(ldamodel.print_topics(num_topics = 7, num_words = 3))
    print("\n")


    allwords = []
    for elements in texts:
        for allell in elements:
            allwords.append(allell)

    stoplist = set('for a of the and to in'.split())  # LSI
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in allwords]
    alltokens = sum(texts, [])
    tokens1 = set(word for word in set(alltokens) if alltokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens1] for text in texts]
    corp = [dic.doc2bow(text) for text in texts]
    lsi = models.lsimodel.LsiModel(corpus = corp, id2word = dic, num_topics = 3)
    print("Результат работы LSI:")
    print("###")
    print(lsi.print_topics(3))

    
    slovnik = {}

    with open("csvdict1.csv", 'r') as f:
        reader = csv.reader(f, delimiter = "\t")
        for row in reader:
            kluch1 = row[2].split(" ")
            for elemk in kluch1:
                kluch = elemk
                kluch = str(kluch).lower()
                if kluch in slovnik:
                    znach = slovnik[kluch]
                    slovnik[kluch] = znach + 1
                else:
                    slovnik[kluch] = 1

    stopslova = "a, about, above, after, again, against, all, am, an, and, any, are, aren't, as, at, be, because, been, before, being, below, between, both, but, by, can't, cannot, could, couldn't, did, didn't, do, does, doesn't, doing, don't, down, during, each, few, for, from, further, had, hadn't, has, hasn't, have, haven't, having, he, he'd, he'll, he's, her, here, here's, hers, herself, him, himself, his, how, how's, i, i'd, i'll, i'm, i've, if, in, into, is, isn't, it, it's, its, itself, let's, me, more, most, mustn't, my, myself, no, nor, not, of, off, on, once, only, or, other, ought, our, ours ourselves, out, over, own, same, shan't, she, she'd, she'll, she's, should, shouldn't, so, some, such, than, that, that's, the, their, theirs, them, themselves, then, there, there's, these, they, they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, very, was, wasn't, we, we'd, we'll, we're, we've, were, weren't, what, what's, when, when's, where, where's, which, while, who, who's, whom, why, why's, with, won't, would, wouldn't, you, you'd, you'll, you're, you've, your, yours, yourself, yourselves"
    sortklucha = sorted(slovnik, key = lambda x: int(slovnik[x]), reverse = True)
    zapis = open("slovnik.csv",'w')
    zapis2 = open("slovnikstopwords.csv", "w")

    zapis.write("Word" + "\t" + "Frequency" + "\n")
    zapis2.write("Word" + "\t" + "Frequency" + "\n")
    try:
        for kluch in sortklucha:
            jjj = str("{0}\t{1}\n").format(kluch, slovnik[kluch])
            zapis.write(jjj)  # full frequency dictionary goes to slovnik.csv
        print("Создан частотный словарь")
    finally:
        zapis.close()
    try:
        # exact-match stop word lookup; a substring test against the long string would also drop words like "he" that occur inside "the"
        stop_set = set(w.strip() for w in stopslova.split(','))
        for kluch in sortklucha:
            if kluch not in stop_set:
                jjj = str("{0}\t{1}\n").format(kluch, slovnik[kluch])
                zapis2.write(jjj)
        print("Создан частотный словарь без стоп-слов")
    finally:
        zapis2.close()
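The LDA section above reduces to four steps: tokenize, build a gensim Dictionary, map each document to bag-of-words counts with doc2bow, and fit LdaModel. A standalone sketch of that pipeline on a made-up toy corpus:

from gensim import corpora, models

docs = [["cat", "sat", "mat"],
        ["dog", "sat", "rug"],
        ["cat", "dog", "pet"]]
dic = corpora.Dictionary(docs)               # word <-> id mapping
bow = [dic.doc2bow(doc) for doc in docs]     # (word_id, count) pairs per document
lda = models.LdaModel(bow, num_topics=2, id2word=dic, passes=10)
print(lda.print_topics(num_topics=2, num_words=3))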
Example #7
File: main.py  Project: haniani/DH
def main():
    USER = "******"
    PASSWD = "089567"
    HOST = "localhost"
    CHARSET = "utf8mb4"
    DATABASE = "music"
    COLLATE = "utf8mb4_unicode_ci"
    #PORT = "3306"
    PATH = './'
    PATH_U = './users'

    x = sql_worker(USER, PASSWD, HOST, CHARSET, DATABASE, COLLATE)
    print("Соединение", x.connect())
    print("Создание", x.create())
    print("Использование", x.use(DATABASE))
    print("Ожидайте, обработка базы")

    dlalbom = []
    dlpesnya = []
    textsong = []
    artist = (x.select("select id,name from wc_lyricsnet_artists limit 1000"))
    for elem in artist:
        album = (x.select(
            "select `id`,`name` from `wc_lyricsnet_albums` where `artist_id` = '%s' and `year` = 1995;"
            % (elem[0])))
        for each in album:
            song = (x.select(
                "select `title` from `wc_lyricsnet_songs` where `artist_id` = '%s' and `album_id` = '%s';"
                % (elem[0], each[0])))
            for ef in song:
                ef = list(ef)
                ef = ''.join(map(str, ef))
                textsong.append(ef)
                #csvwww.write(str(elem[1]) + "\t" + str(each[1]) + "\t" + str(ef) + "\n")  # building the dictionary (commented out)
                #print(elem[1])
                #print(each[1])
                #print(ef)
                dlinaalbom = len(str(each[1]))
                dlinasong = len(str(ef))
                dlalbom.append(dlinaalbom)
                dlpesnya.append(dlinasong)

    xx = np.array(dlalbom)
    yy = np.array(dlpesnya)
    pylab.plot(yy, xx, 'r')
    pylab.xlabel('Количество символов в названии песни')
    pylab.ylabel('Количество символов в названии альбома')
    pylab.title('Соотношение длины названий (в символах)')
    pylab.show()

    correl = np.corrcoef(yy, xx)
    print("Корреляция длины названий песен и альбомов")
    print("Для продолжения закройте диаграмму")
    print("###")
    print(correl)
    plt.scatter(yy, xx)
    plt.show()

    print("Корреляция Пирсона", scipy.stats.pearsonr(yy, xx))
    print("\n")
    # LDA
    texts = []
    tokentoken = SpaceTokenizer()  # tokenization
    en_stop = get_stop_words('en')  # English stop words
    p_stemmer = PorterStemmer()  # stemming

    for i in textsong:
        raw = i.lower()
        tokens = tokentoken.tokenize(raw)

        stopped_tokens = [i for i in tokens if i not in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

    dic = corpora.Dictionary(texts)
    corps = [dic.doc2bow(text) for text in texts]

    ldamodel = gensim.models.ldamodel.LdaModel(corps,
                                               num_topics=7,
                                               id2word=dic,
                                               passes=20)
    try:
        plt.style.use('ggplot')
    except:
        pass

    print("Результат работы LDA:")
    print("###")
    print(ldamodel.print_topics(num_topics=7, num_words=3))
    print("\n")

    allwords = []
    for elements in texts:
        for allell in elements:
            allwords.append(allell)

    stoplist = set('for a of the and to in'.split())  # LSI
    texts = [[
        word for word in document.lower().split() if word not in stoplist
    ] for document in allwords]
    alltokens = sum(texts, [])
    tokens1 = set(word for word in set(alltokens)
                  if alltokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens1] for text in texts]
    corp = [dic.doc2bow(text) for text in texts]
    lsi = models.lsimodel.LsiModel(corpus=corp, id2word=dic, num_topics=3)
    print("Результат работы LSI:")
    print("###")
    print(lsi.print_topics(3))

    slovnik = {}

    with open("csvdict1.csv", 'r') as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
            kluch1 = row[2].split(" ")
            for elemk in kluch1:
                kluch = elemk
                kluch = str(kluch).lower()
                if kluch in slovnik:
                    znach = slovnik[kluch]
                    slovnik[kluch] = znach + 1
                else:
                    slovnik[kluch] = 1

    stopslova = "a, about, above, after, again, against, all, am, an, and, any, are, aren't, as, at, be, because, been, before, being, below, between, both, but, by, can't, cannot, could, couldn't, did, didn't, do, does, doesn't, doing, don't, down, during, each, few, for, from, further, had, hadn't, has, hasn't, have, haven't, having, he, he'd, he'll, he's, her, here, here's, hers, herself, him, himself, his, how, how's, i, i'd, i'll, i'm, i've, if, in, into, is, isn't, it, it's, its, itself, let's, me, more, most, mustn't, my, myself, no, nor, not, of, off, on, once, only, or, other, ought, our, ours ourselves, out, over, own, same, shan't, she, she'd, she'll, she's, should, shouldn't, so, some, such, than, that, that's, the, their, theirs, them, themselves, then, there, there's, these, they, they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, very, was, wasn't, we, we'd, we'll, we're, we've, were, weren't, what, what's, when, when's, where, where's, which, while, who, who's, whom, why, why's, with, won't, would, wouldn't, you, you'd, you'll, you're, you've, your, yours, yourself, yourselves"
    sortklucha = sorted(slovnik, key=lambda x: int(slovnik[x]), reverse=True)
    zapis = open("slovnik.csv", 'w')
    zapis2 = open("slovnikstopwords.csv", "w")

    zapis.write("Word" + "\t" + "Frequency" + "\n")
    zapis2.write("Word" + "\t" + "Frequency" + "\n")
    try:
        for kluch in sortklucha:
            jjj = str("{0}\t{1}\n").format(kluch, slovnik[kluch])
            zapis.write(jjj)  # full frequency dictionary goes to slovnik.csv
        print("Создан частотный словарь")
    finally:
        zapis.close()
    try:
        # exact-match stop word lookup; a substring test against the long string would also drop words like "he" that occur inside "the"
        stop_set = set(w.strip() for w in stopslova.split(','))
        for kluch in sortklucha:
            if kluch not in stop_set:
                jjj = str("{0}\t{1}\n").format(kluch, slovnik[kluch])
                zapis2.write(jjj)
        print("Создан частотный словарь без стоп-слов")
    finally:
        zapis2.close()
class Bag_Of_Words_Graph_Builder(GraphBuilder):
    """The edge between two authors is the jaccard similarity between the bag of words of each author"""
    def __init__(self, db):
        GraphBuilder.__init__(self, db)

        self._author_guid_posts_dict = {}
        self._author_guid_bag_of_words_dict = {}
        self._word_dict = {}

        if self._domain == u'Microblog':
            self._tokenizer = TweetTokenizer()
        else:
            self._tokenizer = SpaceTokenizer()

    def execute(self, window_start=None):
        pass

    def fill_author_guid_posts_dictionary(self):
        self._author_guid_posts_dict = self._db.get_author_posts_dict_by_minimal_num_of_posts(
            self._domain, self._min_number_of_posts_per_author)

    def fill_author_guid_bag_of_words_dictionary(self):
        all_authors_count = len(self._author_guid_posts_dict.keys())
        i = 0
        # Dictionary: key = author_guid, value = list of posts
        for author_guid, posts in self._author_guid_posts_dict.items():
            bow = []
            for post in posts:
                content = post.content
                if content is not None:
                    content = content.lower()
                    content = re.sub(r'http\S+', '', content)  # strip URLs before tokenizing
                    bow += self._tokenizer.tokenize(content)

            bow = list(frozenset(bow))
            self._author_guid_bag_of_words_dict[author_guid] = bow

            for word in bow:
                if word not in self._word_dict:
                    self._word_dict[word] = word
            i += 1
            if i % 100000 == 0:
                print('\r done author ' + str(i) + ' out of ' +
                      str(all_authors_count),
                      end='')
        logging.info("done computing bag of words ")

    def compute_jaccard_index(self, set_1, set_2):
        n = len(set_1.intersection(set_2))
        return n / float(len(set_1) + len(set_2) - n)

    def fill_author_guid_bag_of_words_dictionary_and_calculate_all_combinations(
            self):
        self.fill_author_guid_bag_of_words_dictionary()

        author_guids = self._author_guid_bag_of_words_dict.keys()
        all_authors_count = len(author_guids)
        all_pairs = combinations(author_guids, 2)
        total_combinations = (all_authors_count * (all_authors_count - 1)) // 2  # integer number of author pairs
        """
        Casting all_pairs to an iterable object (frozenset) is NOT a good idea since combinations function returns a generator object,
        which is more memory and CPU efficient than iterable objects
        """
        logging.info("computing similarity between bag of words ")

        i = 0
        for author_guid_1, author_guid_2 in all_pairs:
            i += 1
            print('\r calculating pairs of authors : {0}/{1}'.format(
                i, total_combinations),
                  end='')
            author_guid_1_bag_of_words = self._author_guid_bag_of_words_dict[
                author_guid_1]
            author_guid_2_bag_of_words = self._author_guid_bag_of_words_dict[
                author_guid_2]

            self.calculate_jaccard_index_create_and_save_connection(
                author_guid_1, author_guid_2, author_guid_1_bag_of_words,
                author_guid_2_bag_of_words)

        self._db.save_author_connections(self._author_connections_edges)

    def calculate_jaccard_index_create_and_save_connection(
            self, author_guid_1, author_guid_2, author_guid_1_bag_of_words,
            author_guid_2_bag_of_words):

        weight = self.compute_jaccard_index(set(author_guid_1_bag_of_words),
                                            set(author_guid_2_bag_of_words))
        self._create_and_optional_save_connection(author_guid_1, author_guid_2,
                                                  weight)
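Stripped of the database layer, both graph builders above do the same thing: build one frozenset of tokens per author, then weight every author pair with the Jaccard index. A minimal self-contained sketch of that idea (the author texts are made up):

from itertools import combinations
from nltk.tokenize import SpaceTokenizer  # assumed source of SpaceTokenizer

def jaccard(a, b):
    # |A ∩ B| / |A ∪ B|, guarding against two empty bags.
    inter = len(a & b)
    return inter / float(len(a) + len(b) - inter) if (a or b) else 0.0

tokenizer = SpaceTokenizer()
texts = {
    "author_a": "i love rainy days",
    "author_b": "i love sunny days",
    "author_c": "completely unrelated words here",
}
bags = {guid: frozenset(tokenizer.tokenize(text)) for guid, text in texts.items()}
edges = [(a, b, jaccard(bags[a], bags[b])) for a, b in combinations(sorted(bags), 2)]
print(edges)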