Example #1
    def test_dict_interface(self):
        """Test Python 2 dict-like interface in both Python 2 and 3."""
        d = Dictionary(self.texts)

        self.assertTrue(isinstance(d, Mapping))

        self.assertEqual(list(zip(d.keys(), d.values())), list(d.items()))

        # Even in Py3, we want the iter* members.
        self.assertEqual(list(d.items()), list(d.iteritems()))
        self.assertEqual(list(d.keys()), list(d.iterkeys()))
        self.assertEqual(list(d.values()), list(d.itervalues()))
Example #2
        def _get_weights_lda(text,
                             stopwords,
                             lda_dictionary,
                             lda,
                             w2idx_embedding,
                             target_word,
                             lda_topics=False):
            my_punctuation = '!"#$%&\'()*+,-./:;<=>?@[]^_`{|}~'

            new_corpus = lda_dictionary.doc2bow(text)
            if not lda_topics:
                '''
                code removed
                '''


            number_of_topics_in_current_document = len(lda_topics)
            topic_word_matrix = lda.expElogbeta
            aux = np.reshape(lda_topics, (len(lda_topics), 2))
            topics = aux[:, 0]
            p_pertenencia_a_topics = aux[:, 1]

            word2id_in_file = Dictionary([text]).token2id
            word2id_global_lda = lda_dictionary.token2id
            # Initialize a matrix whose values will be each word's weight.
            # Shape: [number_of_topics_in_current_document, number_of_words]
            topic_filewords_matrix = np.zeros(
                (number_of_topics_in_current_document, len(word2id_in_file)),
                dtype=float)

            for k_top, topic in enumerate(topics):
                for word in word2id_in_file.keys():
                    '''
                    code removed
                    '''

            # Sum over the columns to get each word's accumulated weight across all topics of the document.
            weight = np.sum(topic_filewords_matrix, axis=0)
            # L2-normalize the weight vector.
            # w = weight / np.array([np.linalg.norm(weight, axis=0)]).T
            d = np.sum(weight**2) ** 0.5
            weight = (weight.T / d).T

            # Keep only non-zero weights for words present in the embedding vocabulary.
            weight_words = {
                word: ww
                for word, ww in zip(word2id_in_file.keys(), weight)
                if (ww != 0 and word in w2idx_embedding)
            }

            return weight_words, lda_topics
Example #3
class MyCorpus(object):
    def __init__(self, input_file, K):
        self.K = K
        self.input_file = input_file
        self.dictionary = Dictionary()
        with open(input_file, "rt") as f:
            for line in f:
                self.dictionary.add_documents([line.split()])
        self.dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=K)

    def __iter__(self):
        with open(self.input_file, "rt") as f:
            for line in f:
                yield self.dictionary.doc2bow(line.rstrip().split())

    def __str__(self):
        s = "MyCorpus(" + str(self.dictionary.num_docs) + " documents, "
        s += str(len(self.dictionary.keys())) + " features, "
        s += str(self.dictionary.num_nnz) + " non-zero entries)"
        return s

    def __repr__(self):
        return "MyCorpus('" + self.input_file + "', " + str(self.K) + ")"
Example #4
def gemsim_tfidf(corpus):
    import re

    import numpy as np
    from gensim import similarities
    from gensim.corpora import Dictionary
    from gensim.models import TfidfModel
    corpus = [re.sub(r'[.|,]', '', line.lower()).split() for line in corpus]
    dct = Dictionary(corpus)
    corpus_as_bow = [dct.doc2bow(line) for line in corpus]
    print(corpus)
    sort_list = []
    for key in dct.keys():
        sort_list.append((key, dct[key], dct.dfs[key] * dct.cfs[key]))

    sort_list = sorted(sort_list, key=lambda item: item[2], reverse=True)
    keywords_list = sort_list[:2]  # top two keywords (slicing clamps to the list length)

    keywords_doc = [word[1] for word in keywords_list]
    print(keywords_doc)

    print(dct.token2id)
    model_trained = TfidfModel(corpus_as_bow)
    for doc in model_trained[corpus_as_bow]:
        print(doc)

    index = similarities.MatrixSimilarity(model_trained[corpus_as_bow])
    keywords_tfidf = model_trained[dct.doc2bow(keywords_doc)]
    print(keywords_tfidf)
    sims = index[keywords_tfidf]
    print(sims)
    print(max(sims))
    max_idx = np.argmax(sims)
    print("most similar doc is {}".format(corpus[max_idx]))
Example #5
# Imports needed by this snippet; co_occurrence_matrix is assumed to be defined
# elsewhere in the original module (see the sketch after this example).
import jieba
import numpy as np
import matplotlib.pyplot as mp
from sklearn.cluster import KMeans
from gensim.corpora import Dictionary


def test1():
    sentences = [['我吴彦祖', '我张学友'], ['吴彦祖我', '张学友我刘德华吴彦祖'], ['酸奶芝士', '芝士酸奶'],
                 ['芝士蛋糕', '酸奶芝士蛋糕']]
    ls_of_words = [jieba.lcut(sentence) for sentence in sentences]

    dt = Dictionary(ls_of_words).token2id
    ls_of_wids = [[dt[word] for word in words] for words in ls_of_words]

    dimension = len(dt)  # dimensionality of the co-occurrence matrix
    matrix = np.matrix([[0] * dimension] * dimension)

    for ls in ls_of_wids:
        co_occurrence_matrix(matrix, ls)
    print(matrix)

    # Singular Value Decomposition (SVD)
    U, s, Vh = np.linalg.svd(matrix, full_matrices=False)

    # Clustering
    X = -U[:, 0:2]

    labels = KMeans(n_clusters=2).fit(X).labels_
    colors = ('y', 'g')

    mp.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese characters
    for word in dt.keys():
        i = dt[word]
        mp.scatter(X[i, 1], X[i, 0], c=colors[labels[i]], s=400, alpha=0.4)
        mp.text(X[i, 1], X[i, 0], word, ha='center', va='center')
    mp.show()
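test1 relies on a co_occurrence_matrix helper defined elsewhere in the original file; a plausible minimal sketch (an assumption, not the original code) is:
def co_occurrence_matrix(matrix, word_ids):
    # Count, in place, how often each ordered pair of distinct word ids occurs in the same document.
    for i in word_ids:
        for j in word_ids:
            if i != j:
                matrix[i, j] += 1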
Example #6
class TermFrequency(object):
    """ Computes a term frequency distance_matrix
    """
    def __init__(self, documents):
        logging.log(logging.INFO, "Creating Term Frequency")

        self.id2Word = Dictionary(documents)
        self.num_unique_words = len(self.id2Word)
        self.distance_matrix = self.to_term_frequency_matrix(documents)

    def to_term_frequency_vector(self, document):
        return self.id2Word.doc2bow(document)

    def to_binary_vector(self, document):
        tf = self.id2Word.doc2bow(document)
        vect = sparse2full(tf, len(self.id2Word.keys()))
        return np.array(vect > 0, dtype=int)  # converts to binary

    def to_term_frequency_matrix(self, documents):
        return [self.to_term_frequency_vector(d) for d in documents]

    def binary_matrix(self):
        """ Turns a regular tf distance_matrix into a binary distance_matrix """
        def get_binary_data(val):
            if val <= 0:
                return 0
            return 1

        full_matrix = MatrixHelper.gensim_to_python_mdarray(
            self.distance_matrix, self.num_unique_words)
        return [[get_binary_data(cell) for cell in row] for row in full_matrix]
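A small usage sketch for the class above (the toy documents are made up; MatrixHelper comes from the original project and is only needed for binary_matrix):
docs = [["good", "hotel", "room"], ["bad", "room", "service"]]
tf = TermFrequency(docs)
print(tf.to_term_frequency_vector(["room", "room", "hotel"]))  # sparse BoW, e.g. [(id_hotel, 1), (id_room, 2)]
print(tf.to_binary_vector(["room", "service"]))                # dense 0/1 numpy vector over the whole vocabulary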
Example #8
class TFIDF():
    def __init__(self):
        pass

    def preprocess_tfidf(self):
        return [process_text(r) for r in get_db_records()]

    def create_tfidf_model(self):
        self.dataset = self.preprocess_tfidf()
        self.dct = Dictionary(self.dataset)
        self.dct.filter_extremes(no_below=50)
        corpus = [self.dct.doc2bow(line) for line in self.dataset]
        self.model = TfidfModel(corpus)

    def infer_tfidf(self):
        def infer(vector):
            dim = max(self.dct.keys()) + 1  # size of the dense vector: highest token id + 1
            text1 = self.model[self.dct.doc2bow(vector)]
            t1 = []
            for d in range(dim):
                t1_val = [i[1] for i in text1 if i[0] == d]
                if len(t1_val) == 1:
                    t1.append(t1_val[0])
                else:
                    t1.append(0)
            return t1

        return infer

    @staticmethod
    def load(filename):
        with open(filename, "rb") as f:
            return pickle.load(f)
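A usage sketch (process_text and get_db_records are helpers from the original module, so this is only illustrative):
tfidf = TFIDF()
tfidf.create_tfidf_model()                     # builds self.dct and self.model from the DB records
infer = tfidf.infer_tfidf()
dense_vec = infer(["hotel", "room", "clean"])  # dense TF-IDF vector with one slot per dictionary id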
Example #9
    def test_dict_interface(self):
        """Test Python 2 dict-like interface in both Python 2 and 3."""
        d = Dictionary(self.texts)

        self.assertTrue(isinstance(d, Mapping))

        self.assertEqual(list(zip(d.keys(), d.values())), list(d.items()))

        # Even in Py3, we want the iter* members.
        self.assertEqual(list(d.items()), list(d.iteritems()))
        self.assertEqual(list(d.keys()), list(d.iterkeys()))
        self.assertEqual(list(d.values()), list(d.itervalues()))

        # XXX Do we want list results from the dict members in Py3 too?
        if not PY3:
            self.assertTrue(isinstance(d.items(), list))
            self.assertTrue(isinstance(d.keys(), list))
            self.assertTrue(isinstance(d.values(), list))
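For reference, the dict-like interface exercised above can be tried standalone (the toy texts are made up):
from gensim.corpora import Dictionary

texts = [["human", "interface", "computer"], ["survey", "user", "computer"]]
d = Dictionary(texts)
print(d.keys())        # a plain list of integer token ids, even on Python 3
print(d[d.keys()[0]])  # the token string mapped to the first id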
Example #10
File: G8.py Project: lum4chi/IR
def do_ir2(db, param):
    print 'Computing IR2', db, param, '...'

    def words(text):
        stopwords = set(nltk.corpus.stopwords.words('english'))
        return [w for w in nltk.word_tokenize(text.lower()) if w not in string.punctuation and w not in stopwords]

    class BigramsCorpus:
        def __init__(self, db, collection):
            self.client = MongoClient()[db][collection]

        def __iter__(self):
            for doc in self.client.find():
                yield [doc['_id']]

        def __len__(self):
            return self.client.count()

    bigram_corpus = BigramsCorpus('cordis', 'bi_grams')
    bigrams = Dictionary(bigram_corpus)

    project ={'$project': {'_id': 0, 'title': 1, 'reference': 1}}
    a = [project]
    project_corpus = MongoCorpus('cordis', 'projects', aggregate=a)

    n = max(bigrams.keys())
    dataset = []

    for doc in project_corpus:
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0]*(n+1)
        for bi, _ in temp:
            x[bi] = 1
        dataset.append(x)

    alg = KMeans(n_clusters=int(param))
    alg.fit(dataset)

    clusters = defaultdict(list)
    for i, doc in enumerate(project_corpus):
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0]*(n+1)
        for bi, _ in temp:
            x[bi] = 1
        p = alg.predict([x])
        clusters[p[0]].append(doc['reference'])

    mongo_clusters = []
    for k, v in clusters.items():
        # Cast the numpy cluster label to a plain int so pymongo can encode it
        # (otherwise Mongo raises: InvalidDocument: Cannot encode object: 0).
        mongo_clusters.append({'cluster': int(k), 'projects': v})

    print mongo_clusters
    # Save to a Mongo collection
    mongo = MongoClient()['g8']['ir2']
    mongo.insert_many(mongo_clusters)
    print 'Done!'
Example #11
def tfidf_train(table,
                tokens_col,
                tf_weighing='n',
                df_weighing='t',
                document_normalization='c'):

    out_table = table.copy()
    _corpus = out_table[tokens_col]
    _smartirs = tf_weighing + df_weighing + document_normalization

    _dictionary = Dictionary(_corpus)
    _corpus = [_dictionary.doc2bow(text) for text in _corpus]

    _model = TfidfModel(_corpus, smartirs=_smartirs)
    _corpus = list(_model[_corpus])

    _sparse_matrix = corpus2csc(_corpus, num_terms=len(_dictionary.token2id)).T

    _values = list(_dictionary.values())
    _keys = list(_dictionary.keys())
    _dic = pd.DataFrame({'indice': _keys, 'word': _values})
    rb = ReportBuilder()
    rb.addMD(
        strip_margin("""
    | ## Dictionary
    | {table1}
    """.format(table1=pandasDF2MD(_dic))))

    out_table['sparse_vectors'] = sparse_encode(
        _sparse_matrix)['sparse_vectors']

    fit_model = dict()
    fit_model['dictionary'] = _dictionary
    fit_model['model'] = _model
    fit_model['report'] = rb.get()
    return {'out_table': out_table, 'fit_model': fit_model}
Example #12 (word2vec)

train.text.apply(lambda x: list(map(dictionary.token2id.get, x)))

list(map(dictionary.token2id.get,['allah']))

dictionary.token2id

test.text=test.text.apply(lambda x: dictionary.doc2idx(x))

test_text=pad_sequences(test.text)
#test_target=test.target.values

test_text.shape,train_text.shape

len(dictionary.keys())

from sklearn.model_selection import train_test_split
train_x,val_x,train_y,val_y=train_test_split(train_text,train_target,test_size=.2)

from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout,LSTM

vocab_size=len(dictionary.keys())
input_dimension=train_x.shape[1]



def create_model():
    model=Sequential()
    model.add(Embedding(vocab_size,16,input_length=input_dimension))
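    # The source snippet is cut off here; a plausible completion (an assumption, not the original code):
    model.add(LSTM(16))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model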
Example #13
import re

import numpy as np
from gensim.corpora import Dictionary

# `data` is assumed to be a single string loaded earlier in the original script.
data = data.split(" = ")
datagensim = []
regex = re.compile('[^a-zA-Z ]')
for d in data:

    # Strip everything except ASCII letters and spaces.
    test = regex.sub('', d)
    # Keep only documents longer than 100 characters, tokenized on spaces,
    # dropping tokens of 3 characters or fewer.
    if len(test) > 100:
        datagensim += [[i for i in test.split(" ") if len(i) > 3]]

dct = Dictionary(datagensim)
dct.filter_extremes(no_below=2, no_above=0.9)
dct.compactify()
X = np.zeros((len(dct.keys()), len(datagensim)), int)
i = 0
bow = []
datagensimClean = []
for d in datagensim:

    idx = dct.doc2idx(d)
    dC = [d[i] for i in range(len(d)) if idx[i] > -1]
    tmp = dct.doc2bow(dC)
    datagensimClean += [dC]
    bow += [tmp]
    for key, value in tmp:
        X[key, i] = value
    i += 1

datagensim = datagensimClean
Example #14
            elif (el['paperId'] in S2_Proto) and (S2_Proto[el['paperId']]
                                                  in paper_ids):
                npapers = npapers + 1
                tokens = tokens + paper_corpus[paper_ids.index(
                    S2_Proto[el['paperId']])]
        author_npapers.append(npapers)
        author_corpus.append(tokens)

dct = Dictionary(author_corpus)  # fit dictionary
corpus_bow = [dct.doc2bow(line)
              for line in author_corpus]  # convert corpus to BoW format

model = TfidfModel(corpus_bow)

corpus_tfidf = [model[el] for el in corpus_bow]
vocab_tfidf = [dct[el] for el in dct.keys()]

with vocab_tfidf_file.open('w') as fout:
    for idx, wd in enumerate(vocab_tfidf):
        fout.write(str(idx) + ':' + wd + '\n')

with corpus_tfidf_file.open('w') as fout:
    for an, anp, doc_tfidf in zip(author_names, author_npapers, corpus_tfidf):
        fout.write(an.replace(' ', '_') + ' ' + str(anp))
        for token in doc_tfidf:
            fout.write(' ' + str(token[0]) + ':' + str(token[1]))

        fout.write('\n')
Example #15
datagensim = []
regex = re.compile('[^a-zA-Z ]')
for d in data[:200]:

    # Strip everything except ASCII letters and spaces.
    test = regex.sub('', d)
    # Keep only documents longer than 100 characters, tokenized on spaces,
    # lower-cased, dropping tokens of 2 characters or fewer.
    if len(test) > 100:
        datagensim += [[i.lower() for i in test.split(" ") if len(i) > 2]]
#gensim.utils.lemmatize(
dct = Dictionary(datagensim)
dct.filter_extremes(keep_n=50000, no_above=0.8)
dct.compactify()
X = np.zeros((len(dct.keys()), len(datagensim)), int)
i = 0
bow = []
datagensimClean = []
for d in datagensim:

    idx = dct.doc2idx(d)
    dC = [d[i] for i in range(len(d)) if idx[i] > -1]
    tmp = dct.doc2bow(dC)
    datagensimClean += [dC]
    bow += [tmp]
    for key, value in tmp:
        X[key, i] = value
    i += 1

datagensim = datagensimClean
Example #16
def analyze(originfile, all=False):
    keywords = helper.getKeywords(originfile)
    os.chdir('./resources/stanford-corenlp-full-2018-10-05')
    os.system('kill $(lsof -t -i:9000)')
    cmd = 'java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 10000000000000 &'
    time.sleep(4)
    print("starting nlp service")
    with open(os.devnull, "w") as f:
        subprocess.call(cmd, shell=True, stderr=f, stdout=f)
    time.sleep(4)
    print("nlp service started")
    os.chdir('../../')
    nlp_wrapper = StanfordCoreNLP('http://localhost:9000')
    print("Number of processors: ", mp.cpu_count())
    if all:
        print("all")
        '''if not os.path.isfile('/resources/all_test.csv'):
            print("test file created")
            open('./resources/all_test.csv', 'w').close()'''
        conn = db.db_connection()
        dbo = db.db_operator(conn)
        spell = SpellChecker()
        counter = Value('i', 1)
        corpus_tok_all=[]
        '''for i in range(1790):
            print('i=' +str(i))
            print("limit= 10000")
            print("offset= "+str(10000*i))
            conn.connect()
            query = 'SELECT reviews.ReviewID, reviews.Country as \'Tourist_Country\', ' \
                    'hotels.CountryID as \'Hotel Country\', Good, reviews.Bad ' \
                    'FROM masterthesis.reviews, masterthesis.hotels ' \
                    'where hotels.HotelNumber=reviews.HotelNumber limit 10000 offset '+str(10000*i)+';'
            results = [list(x) for x in dbo.execute(query)];
            conn.disconnect()
            print("got results from sql")
            print("starting analysis")
            print("tot number rows= " + str(len(results)))
            try:
                print('analyzing 10000 rows '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,initargs=(counter, spell, nlp_wrapper,), )
                corpus_tok = pool.map_async(thread_function_row_only_all, [doc for doc in results]).get(timeout=1200)
                pool.close()
                pool.terminate()
                pool.join()
                print('got corpus_tok for 10000 rows '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            except TimeoutError:
                print("timeout error")
                pool.close()
                pool.terminate()
                pool.join()
                corpus_tok=[]
                for doc in results:
                    try:
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                        c=pool.map_async(thread_function_row_only_all, [doc]).get(timeout=60)
                        #print('pool close')
                        pool.close()
                        pool.terminate()
                        #print('pool join')
                        pool.join()

                    except TimeoutError:
                        print(str(doc)+" caused Exception")
                        pool.close()
                        pool.terminate()
                        #print('pool join')
                        pool.join()
                        c=[None]
                    corpus_tok.append(c[0])
            print("beginning removal of sents with contrast")
            corpus_tok = [r for r in corpus_tok if r != None]
            print('len corpus_tok_reduced= '+str(len(corpus_tok)))
            corpus_tok_all+=corpus_tok
            print('len corpus_tok_all= ' + str(len(corpus_tok_all)))
            if i%100==0 and i!=0:
                with open('./resources/all_test.csv', mode='a') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    for c in corpus_tok_all:
                        writer.writerow(c)
                file.close()
                corpus_tok_all=[]
        '''


        '''
        corpus_tok_all=[]
        i=0
        kk=set()
        with open('./resources/all_test.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i%100000==0:
                    print(i)
                #if i%10000==0:break
                ar=((row[0].replace('[','')).replace(']','')).split(',')
                if ar[1][-1]!="'":#France, Metro.
                    ar[1]=ar[1]+','+ar[2]
                    for j in range(2,len(ar)-1):
                        ar[j]=ar[j+1]
                    del ar[len(ar)-1]
                ar[1]=ar[1][2:-1]
                ar[2] = (ar[2].replace("'", '')).replace(' ', '')
                rev=''.join(ar[3:])
                revlist= ar[:3]
                revlist.append(rev)
                tokens = ((((row[1].replace(']', '')).replace('[','')).replace("'",'')).replace(" ",'')).split(',')
                r=(revlist,tokens)
                k=ar[0]
                if k not in kk:
                    kk.add(k)
                    corpus_tok_all.append(r)
        file.close()
        corpus_tok=corpus_tok_all
        corpustokonly = [r[1] for r in corpus_tok]
        print("doing bigrams")
        # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
        lenc=len(corpus_tok)
        print("corpus_tok len = "+str(lenc))
        for idx in range(lenc):
            if idx%100000==0:
                print(idx)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            for token in bigram[corpustokonly[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    corpus_tok[idx][1].append(token)
        with open('./resources/corpus_tok_all.csv', mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                writer.writerows(corpus_tok)
        file.close()
        print("corpus_tok written")
        from gensim.corpora import Dictionary
        print("writing frequence file")
        '''

        



        '''all_set=set()
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if not (keyword == 'cleaning' or keyword=='pet'):
                    start_time = time.time()
                    print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    raw_corpus = helper.getRawCorpus(
                        csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                      encoding="utf8", newline='\n'), additionaldetails=True)
                    # corpus = helper.getCorpusTextFromRaw(raw_corpus)
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    print("starting analysis")
                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                                   initargs=(counter, spell, nlp_wrapper,), )
                    corpus_tok = pool.map_async(thread_function_row_only, [doc for doc in raw_corpus]).get()
                    print('pool close')
                    pool.close()
                    print('pool join')
                    pool.join()
                    print("beginning removal of sents with contrast")
                    corpus_tok = [r for r in corpus_tok if r != None]
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
                    print('len all_set_tok before= ' + str(len(all_set)))
                    print('len corpus_tok= ' + str(len(corpus_tok)))
                    print('len corpus_tok+all_set_tok= ' + str(len(corpus_tok) + len(all_set)))
                    for sen in corpus_tok:
                        all_set.add((tuple(sen[0]),tuple(sen[1])))
                    print('len all_set_tok after= ' + str(len(all_set)))
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
        # Compute bigrams.
        if len(all_set) > 0:
            corpus_tok=[(list(x[0]),list(x[1])) for x in all_set]
            corpustokonly = [r[1] for r in corpus_tok]
            print("doing bigrams")
            # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
            bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
            for idx in range(len(corpus_tok)):
                for token in bigram[corpustokonly[idx]]:
                    if '_' in token:
                        # Token is a bigram, add to document.
                        corpus_tok[idx][1].append(token)
            from gensim.corpora import Dictionary
            print("writing frequence file")

            # Create a dictionary representation of the documents.
            dictionary = Dictionary(corpustokonly)

            alltok = []
            freq = []
            for doc in corpustokonly:
                for tok in doc:
                    alltok.append(tok)
            lencorpus = len(corpus_tok)
            print("len dictionary = " + str(len(dictionary.keys())))
            i = 0
            for t in dictionary:
                i += 1
                if i % 1000 == 0:
                    print("analyzing token " + str(i))
                freqsent = 0
                for doc in corpustokonly:
                    if dictionary.get(t) in doc:
                        freqsent += 1
                freq.append((t, dictionary.get(t), alltok.count(dictionary.get(t)),
                             alltok.count(dictionary.get(t)) / len(alltok), freqsent, freqsent / lencorpus))
            freq.sort(key=lambda tup: tup[5], reverse=True)
            for i in range(len(freq)):
                freq[i] = tuple(list(freq[i]) + [i])
            if not os.path.exists('resources/bow/allfreq/stanford/'):
                os.makedirs('resources/bow/allfreq/stanford/')
            with open('resources/bow/allfreq/stanford/all.txt',
                      'w') as f:
                for item in freq:
                    f.write(str(item) + '\n')
                f.close()

            print("writing bow file")
            top_tokens = [f[1] for f in freq[:500]]
            lentoptok = len(top_tokens)
            corpus_bow = {}
            toplen = 0
            for i in range(len(corpus_tok)):
                corpus_bow[i] = [0] * lentoptok
                if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                    toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
                for tok in corpus_tok[i][1]:
                    if tok in top_tokens:
                        corpus_bow[i][top_tokens.index(tok)] = 1

            with open('resources/bow/all.csv', mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow([''] * toplen + top_tokens)
                for i in corpus_bow.keys():
                    writer.writerow(corpus_tok[i][0] + corpus_tok[i][1] + [''] * (
                            toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) + corpus_bow[i])
            file.close()
        '''
        


        # Create a dictionary representation of the documents.
        '''dictionary = Dictionary(corpustokonly)

        alltok = []
        freq = []
        for doc in corpustokonly:
            for tok in doc:
                alltok.append(tok)
        lencorpus = len(corpus_tok)
        print("len dictionary = " + str(len(dictionary.keys())))
        time.sleep(100000)
        counter = Value('i', 0)
        pool = mp.Pool(initializer=init_globals_token_analyzer, processes=mp.cpu_count(), initargs=(counter,corpustokonly,dictionary,lencorpus,alltok), )
        print("pool initialized")
        corpustokonly=None
        alltok=None
        del corpustokonly, alltok
        freq = pool.map_async(thread_function_row_only_token_analyzer, [t for t in dictionary]).get()
        pool.close()
        pool.terminate()
        pool.join()
        dictionary=None
        del dictionary
        global ctonly, dic, alltoks
        ctonly=None
        dic=None
        alltoks=None
        del ctonly,dic,alltoks
        print("frequence list len= "+str(len(freq)))
        print("frequence list created")
        freq.sort(key=lambda tup: tup[5], reverse=True)
        print("frequence list sorted")
        for i in range(len(freq)):
            if i%10000==0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            freq[i] = tuple(list(freq[i]) + [i])
        print("frequence list modified")
        if not os.path.exists('resources/bow/allfreq/stanford/'):
            os.makedirs('resources/bow/allfreq/stanford/')
        i=0
        '''
        '''with open('resources/bow/allfreq/stanford/all.txt', 'w') as f:
            for item in freq:
                i+=1
                if i%10000==0:
                    print(i)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                f.write(str(item) + '\n')
            f.close()'''

        corpus_tok=[]
        i=0
        with open('./resources/corpus_tok_all.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i%100000==0:
                    print(i)
                corpus_tok.append(row)
        file.close()
        print("len corpus_tok= "+str(len(corpus_tok)))
        freq=[]
        i=0
        with open('./resources/bow/allfreq/stanford/all.txt', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i==501:break
                freq.append(row)
        file.close()
        for i in range(len(freq)):
            freq[i]=freq[i][0]
            freq[i]=freq[i].replace("'",'')
            freq[i]=freq[i].replace('"','')
            freq[i]=freq[i].replace('(','')
            freq[i]=freq[i].replace(')','')
            freq[i]=freq[i].replace(' ','')
            freq[i]=freq[i].split(',')
            freq[i]=tuple(freq[i])
        for i in range(len(corpus_tok)):
            if i%100000==0:
                print(i)
            corpus_tok[i][0]=corpus_tok[i][0].replace('[','')
            corpus_tok[i][0]=corpus_tok[i][0].replace(']','')
            det=(corpus_tok[i][0].split(','))
            if 'São Tomé' in det[1]:#São Tomé and PrÃ\\\\xadncipe
                det[1]='  '+'São Tomé and PrÃ\xadncipe'+' '
            if det[1][-1]!="'":#France, Metro
                if 'Ivoire' in det[1]:#Cote d'Ivoire
                    det[1]=det[1].replace('\\','')
                    det[2]=det[2][1:]
                else:
                    det[1]=det[1]+','+det[2]
                    for j in range(2,len(det)-1):
                        det[j]=det[j+1]
                    del det[len(det)-1]
            det=det[:3]
            desc=(corpus_tok[i][0].split(','))[-1]
            det[0]=det[0][1:-1]
            det[1]=det[1][2:-1]
            det[2]=det[2][2:-1]
            desc=desc[3:-1]
            det.append(desc)
            corpus_tok[i][0]=det
            corpus_tok[i][1]=corpus_tok[i][1].replace("'",'')
            corpus_tok[i][1]=corpus_tok[i][1].replace(' ','')
            corpus_tok[i][1]=corpus_tok[i][1].replace('[','')
            corpus_tok[i][1]=corpus_tok[i][1].replace(']','')
            corpus_tok[i][1]=corpus_tok[i][1].split(',')
        print("writing bow file")
        top_tokens = [f[1] for f in freq[:400]]
        lentoptok = len(top_tokens)
        corpus_bow = {}
        toplen = 0
        print("corpus_tok_len= "+str(len(corpus_tok)))
        for i in range(len(corpus_tok)):
            corpus_bow[i] = [0] * lentoptok
            if i%100000==0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
            for tok in corpus_tok[i][1]:
                if tok in top_tokens:
                    corpus_bow[i][top_tokens.index(tok)] = 1
        print("len corpus_bow keys= "+str(len(corpus_bow.keys())))
        print("got corpus_bow")
        j=0
        print("corpus_bow_len "+str(len(corpus_bow)))
        with open('resources/bow/all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow([''] * toplen + top_tokens)
            for i in corpus_bow.keys():
                j+=1
                if j%100000==0:
                    print(j)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                writer.writerow(
                    corpus_tok[i][0] + corpus_tok[i][1] + [''] * (toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) +
                    corpus_bow[i])
        file.close()
        print("over")
    else:
        print("not all")
        for emotion in ['Good','Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if emotion=='Good' and keyword=='cleaning':#cleaning good
                    start_time = time.time()
                    print(keyword+' ---- '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    corpus_tok_all=[]
                    #if not os.path.isfile('/resources/cleaning_test.csv'):
                        #open('./resources/cleaning_test.csv', 'w').close()
                    for i in range(400):#400
                        print(str(i))
                        offset=i*1000
                        limit=1000
                        print("starting reading")
                        print("limit="+str(limit))
                        print("offset="+str(offset))
                        raw_corpus = helper.getRawCorpus(
                            csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                          encoding="utf8", newline='\n'), additionaldetails=True, limit=limit, offset=offset)

                        #corpus = helper.getCorpusTextFromRaw(raw_corpus)
                        #raw_corpus_half_one = raw_corpus[:int(len(raw_corpus) / 2)]
                        #raw_corpus_half_two=raw_corpus[int(len(raw_corpus)/2):]
                        print("starting analysis")
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                        try:
                            corpus_tok = pool.map_async(thread_function_row_only, [doc for doc in raw_corpus]).get(timeout=30)
                            pool.close()
                            pool.join()
                        except TimeoutError:
                            print("timeout error")
                            print('pool close')
                            pool.close()
                            print('pool terminate')
                            pool.terminate()
                            print('pool join')
                            pool.join()
                            corpus_tok=[]
                            for doc in raw_corpus:
                                try:
                                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                                    c=pool.map_async(thread_function_row_only, [doc]).get(timeout=30)
                                    #print('pool close')
                                    pool.close()
                                    #print('pool join')
                                    pool.join()
                                    '''thread = threading.Thread(target = thread_function_row_only, args = (doc))
                                    thread.start()
                                    thread.join()
                                    c=que.get()'''
                                except TimeoutError:
                                    print(str(doc)+" caused Exception")
                                    c=[None]
                                corpus_tok.append(c[0])
                        corpus_tok_reduced = [r for r in corpus_tok if r is not None]
                        print("len corpus_tok: " + str(len(corpus_tok)))
                        print("len corpus_tok_reduced: " + str(len(corpus_tok_reduced)))
                        '''with open('./resources/cleaning_test.csv', mode='a') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                            for c in corpus_tok_reduced:
                                writer.writerow(c)
                        file.close()'''
                        corpus_tok_all+=corpus_tok_reduced
                        print("len corpus_tok_all: " + str(len(corpus_tok_all)))
                    '''
                    corpus_tok=[]
                    s=0
                    for doc in corpus:
                        newdoc=False
                        doc = doc.lower()
                        s += 1
                        if s % 10000 == 0:
                            print(str(s))
                        for con in constr_conjs:
                            if con in doc:
                                newdoc=True
                                break
                        if not newdoc:
                            toks = [spell.correction(tok['lemma']) for tok in
                                    nlp_wrapper.annotate(doc,
                                                         properties={'annotators': 'lemma, pos', 'outputFormat': 'json', })[
                                        'sentences'][0]['tokens']
                                    if tok['pos'] in ['NNS', 'NN'] and len(tok['lemma']) > 1]
                            toapp = []
                            for i in range(len(toks)):
                                if '/' in toks[i]:
                                    for tok in toks[i].split('/'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            toapp = []
                            for i in range(len(toks)):
                                if '-' in toks[i]:
                                    for tok in toks[i].split('-'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            corpus_tok.append(toks)'''
                    #print("beginning removal of sents with contrast")
                    corpus_tok=corpus_tok_all
                    print("len corpus_tok: " + str(len(corpus_tok)))
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
                    # Compute bigrams.
                    if len(corpus_tok)>0:
                        corpustokonly=[r[1] for r in corpus_tok]
                        print("doing bigrams")
                        # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
                        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
                        for idx in range(len(corpus_tok)):
                            for token in bigram[corpustokonly[idx]]:
                                if '_' in token:
                                    # Token is a bigram, add to document.
                                    corpus_tok[idx][1].append(token)
                        from gensim.corpora import Dictionary
                        print("writing frequence file")

                        # Create a dictionary representation of the documents.
                        dictionary = Dictionary(corpustokonly)

                        alltok = []
                        freq=[]
                        for doc in corpustokonly:
                            for tok in doc:
                                alltok.append(tok)
                        lencorpus=len(corpus_tok)
                        print("len dictionary = "+str(len(dictionary.keys())))
                        i=0
                        for t in dictionary:
                            i+=1
                            if i%1000==0:
                                print("analyzing token "+str(i))
                            freqsent = 0
                            for doc in corpustokonly:
                                if dictionary.get(t) in doc:
                                    freqsent+=1
                            freq.append((t,dictionary.get(t),alltok.count(dictionary.get(t)),alltok.count(dictionary.get(t))/len(alltok),freqsent,freqsent/lencorpus))
                        freq.sort(key=lambda tup: tup[5], reverse=True)
                        for i in range(len(freq)):
                            freq[i]=tuple(list(freq[i])+[i])
                        if not os.path.exists('resources/bow/allfreq/stanford/'):
                            os.makedirs('resources/bow/allfreq/stanford/')
                        with open('resources/bow/allfreq/stanford/'+keyword+'_'+emotion.lower()+'.txt', 'w') as f:
                            for item in freq:
                                f.write(str(item)+'\n')
                            f.close()

                        print("writing bow file")
                        top_tokens=[f[1] for f in freq[:500]]
                        lentoptok=len(top_tokens)
                        corpus_bow={}
                        toplen=0
                        for i in range(len(corpus_tok)):
                            corpus_bow[i]=[0]*lentoptok
                            if len(corpus_tok[i][0]+corpus_tok[i][1])>toplen:
                                toplen=len(corpus_tok[i][0]+corpus_tok[i][1])
                            for tok in corpus_tok[i][1]:
                                if tok in top_tokens:
                                    corpus_bow[i][top_tokens.index(tok)]=1

                        with open('resources/bow/'+keyword+'_'+emotion.lower()+'.csv', mode='w') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"',
                                                         quoting=csv.QUOTE_MINIMAL)
                            writer.writerow(['']*toplen+top_tokens)
                            for i in corpus_bow.keys():
                                writer.writerow(corpus_tok[i][0]+corpus_tok[i][1]+['']*(toplen-len(corpus_tok[i][0]+corpus_tok[i][1]))+corpus_bow[i])
                        file.close()
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
    f.close()
Example #17
class LdaSelector(QObject):
    # Signal emitted when the topic-number train/test run finishes
    train_test_over_msg = pyqtSignal()
    # Signal emitted when topic extraction finishes: the first str argument is the journal name, the second str is the year, and the int is the number of extracted topic words
    select_over_msg = pyqtSignal(str, str, int)
    # Absolute path of the current file
    abspath = os.path.dirname(__file__)

    def __init__(self):
        QObject.__init__(self)
        # Placeholders; both are replaced with real objects in build_corpus().
        self.dictionary = Dictionary()
        self.corpus_a = []

    def perplexity(self, ldamodel, testset, dictionary, size_dictionary,
                   num_topics):
        """calculate the perplexity of a lda-model"""
        # dictionary : {7822:'deferment', 1841:'circuitry',19202:'fabianism'...]
        # print ('the info of this ldamodel: \n')
        # print ('num of testset: %s; size_dictionary: %s; num of topics: %s'%(len(testset), size_dictionary, num_topics))
        prep = 0.0
        prob_doc_sum = 0.0
        # store the topic-word probabilities: [(u'business', 0.010020942661849608), (u'family', 0.0088027946271537413), ...]
        topic_word_list = []
        for topic_id in range(num_topics):
            topic_word = ldamodel.show_topic(topic_id, size_dictionary)
            dic = {}
            for word, probability in topic_word:
                dic[word] = probability
            topic_word_list.append(dic)
        doc_topics_ist = [
        ]  # store the doc-topic tuples:[(0, 0.0006211180124223594),(1, 0.0006211180124223594),...]
        for doc in testset:
            doc_topics_ist.append(
                ldamodel.get_document_topics(doc, minimum_probability=0))
        testset_word_num = 0
        for i in range(len(testset)):
            prob_doc = 0.0  # the probability of the doc
            doc = testset[i]
            doc_word_num = 0  # the num of words in the doc
            for word_id, num in doc:
                prob_word = 0.0  # the probability of the word
                doc_word_num += num
                word = dictionary[word_id]
                for topic_id in range(num_topics):
                    # cal p(w) : p(w) = sumz(p(z)*p(w|z))
                    prob_topic = doc_topics_ist[i][topic_id][1]
                    prob_topic_word = topic_word_list[topic_id][word]
                    prob_word += prob_topic * prob_topic_word
                prob_doc += math.log(prob_word)  # p(d) = sum(log(p(w)))
            prob_doc_sum += prob_doc
            testset_word_num += doc_word_num
        prep = math.exp(-prob_doc_sum / testset_word_num)  # perplexity = exp(-sum(log p(d)) / sum(N_d))
        # print ("the perplexity of this ldamodel is : %s"%prep)
        return prep
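    # In formula form, the value computed above is
    #     perplexity = exp(-sum_d log p(d) / sum_d N_d),  with  p(w | d) = sum_z p(z | d) * p(w | z),
    # i.e. the exponentiated negative average log-likelihood per token on the held-out set.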

    def build_corpus(self, journal, year):
        input_path = self.abspath + '/data/journal_year/' + journal + '/'
        input_filename = year + '.csv'
        input_file_big = open(input_path + input_filename,
                              'r',
                              encoding='utf-8',
                              errors='ignore').readlines()
        list_stopWords = list(set(stopwords.words('english')))
        # Lowercase
        input_file = [text.lower() for text in input_file_big]
        # Tokenize
        list_words = [word_tokenize(text) for text in input_file]
        # Filter stopwords
        filtered_words = [[w for w in text if not w in list_stopWords]
                          for text in list_words]
        # Filter punctuation
        english_punctuations = [
            ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@',
            '#', '$', '%', '’', '≤', 'a.', 'b.', 'c.', 'd.', 'e.', 'm.', 'n.',
            'p.', 'f.', 'g.', 'h.', 'i.', 'j.', 'k.', 'l.', 'o.', 'q.', 'r.',
            's.', 't.', 'u.', 'v.', 'w.', 'x.', 'y.', 'z.'
        ]
        text_list = [[
            word for word in text if word not in english_punctuations
        ] for text in filtered_words]
        dropword = [
            'model', 'method', 'published', 'results', 'using', 'study', 'The',
            '\'\'', '``', 'two', 'paper', 'online'
        ]
        text_list2 = [[word for word in text if word not in dropword]
                      for text in text_list]
        # Filter out tokens containing digits
        train_set = [[
            word for word in text if bool(re.search(r'\d', word)) == False
        ] for text in text_list2]
        # res=[]
        # for word in text_list2:
        #     if bool(re.search(r'\d', word))==False:
        #         res.append(word)
        #     else:
        #         pass
        # Build the training corpus
        self.dictionary = Dictionary(train_set)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5)
        self.corpus_a = [self.dictionary.doc2bow(text) for text in train_set]

    def train_test(self, journal, year, upper_bound, lower_bound, step):
        self.build_corpus(journal, year)
        # Split into train / test sets
        tfidf = models.TfidfModel(self.corpus_a)
        corpus = tfidf[self.corpus_a]
        p = int(len(corpus) * .8)
        cp_train = corpus[0:p]
        cp_test = corpus[p:]
        # Train the LDA model
        # (from 2013 on, start with 50 topics)
        grid = dict()
        for topic in range(lower_bound, upper_bound, step):
            # grid[topic]=[]
            grid[topic] = []
            # lda = LdaModel(corpus=corpus_a, id2word=dictionary, num_topics=topic,passes=2,update_every=0,alpha='auto',iterations = 500)
            lda = LdaModel(corpus=cp_train,
                           id2word=self.dictionary,
                           num_topics=topic,
                           passes=2,
                           update_every=0,
                           alpha='auto',
                           iterations=500)
            # test_perplexity=lda.log_perplexity(cp_test)
            # perplex= lda.bound(cp_test)
            # test_perplexity = numpy.exp2(-perplex / sum(cnt for document in cp_test for cnt in document))
            test_perplexity = self.perplexity(lda, cp_test, self.dictionary,
                                              len(self.dictionary.keys()),
                                              topic)
            print(topic)
            print(test_perplexity)
            grid[topic].append(test_perplexity)

        df = pd.DataFrame(grid)
        plt.figure(figsize=(14, 8), dpi=120)
        plt.subplot(221)
        plt.plot(df.columns.values, df.iloc[0].values, '#007A99', linewidth=2)
        plt.xticks(df.columns.values)
        plt.ylabel(journal + '_' + year + '_test_perplexity')
        plt.show()
        self.train_test_over_msg.emit()

    def select_lda(self, journal, year, num_topics):
        self.build_corpus(journal, year)
        # Output the topics
        lda = LdaModel(corpus=self.corpus_a,
                       id2word=self.dictionary,
                       num_topics=num_topics,
                       passes=2,
                       update_every=0,
                       alpha='auto',
                       iterations=500)
        output_path = self.abspath + '/data/lda_topic/' + journal + '/'
        output_filename = year + '.txt'
        with open(output_path + output_filename,
                  'w',
                  newline='',
                  encoding='UTF-8') as f:
            for i in range(0, num_topics):
                input_str = lda.show_topic(i, topn=30)[0][0] + ':' + str(
                    lda.show_topic(i, topn=30)[0][1])
                for j in range(1, len(lda.show_topic(i, topn=30))):
                    word = lda.show_topic(i, topn=30)[j][0] + ':' + str(
                        lda.show_topic(i, topn=30)[j][1])
                    input_str = input_str + ',' + word
                f.write(input_str + '\n')
        self.select_over_msg.emit(journal, year, num_topics)
Example #18
def prepare_text_for_fitting(full_texts, sentences, nlp, **kwargs):
    #Grap and parse the chapters/sentences from the input corpus
    chapters = full_texts.split('\n\n\n\n\n\n')
    p_chapters = [
        tokenize(nlp(chapter_return(chapter))) for chapter in chapters
    ]
    p_sentences = [tokenize(nlp(sentence)) for sentence in sentences]
    #Create gensim dictionaries and carefully filter the high/low occurring words.
    text_dict = Dictionary(p_chapters)
    sentence_dict = Dictionary(p_sentences)
    text_dict.filter_extremes(no_below=4, no_above=0.22)
    print(len(text_dict))
    text_dict.compactify()
    text_dict[text_dict.keys()[0]]  # access one id so the dictionary's id2token mapping gets built
    #Get the bag of word representation for every word in each chapter
    chap_corpus = [text_dict.doc2bow(c) for c in p_chapters]
    #sent_corpus = [text_dict.doc2bow(s) for s in p_sentences]
    #The GloVe vector representation of each word in all of the chapters
    tf_idf_glove = np.vstack(
        [nlp(text_dict[i]).vector for i in range(len(text_dict))])
    #Create a normed set of the vectors for easy similarity scoring
    normed_vecs = copy.deepcopy(tf_idf_glove)
    for i, nv in enumerate(normed_vecs):
        normed_vecs[i] = nv / np.linalg.norm(nv)
    #Get the bag of word rep. for each applicable sentence.
    #If a word is not in the dictionary, we grab and weight the most similar available word.
    sent_corpus = [
        get_sent_bow(s, text_dict, nlp, preload=normed_vecs)
        for s in p_sentences
    ]
    #pickle.dump(sent_corpus,open('raw_count_mat.pckl','wb'))
    #Could use atn or ntn as well as ltn
    if os.path.isfile('tf_idf_sent_mat_samp4.pckl'):
        sent_vecs = pickle.load(open('tf_idf_sent_mat_samp4.pckl', 'rb'))
    else:
        #Create a TF-IDF model for the text as a whole
        model_tfidf = TfidfModel(chap_corpus,
                                 id2word=text_dict,
                                 smartirs='ltn')
        model_tfidf.save('tfidf_model_samp4')
        #Apply the model to each word in the applicable sentences
        sent_tfidf = model_tfidf[sent_corpus]
        #Unpack each TF-IDF vector
        sent_vecs = np.vstack(
            [sparse2full(c, len(text_dict)) for c in sent_tfidf])
        pickle.dump(sent_vecs, open('tf_idf_sent_mat_samp4.pckl', 'wb'))

    if os.path.isfile('glove_sent_mat_samp4.pckl'):
        sent_glove_mat = pickle.load(open('glove_sent_mat_samp4.pckl', 'rb'))
    else:
        #Weight the glove vector representation by the appropriate TF-IDF values
        sent_glove_mat = np.dot(sent_vecs, tf_idf_glove)
        pickle.dump(sent_glove_mat, open('glove_sent_mat_samp4.pckl', 'wb'))
    if os.path.isfile('sent_w2v_mat_samp4.pckl'):
        sent_w2v_mat = pickle.load(open('sent_w2v_mat_samp4.pckl', 'rb'))
    else:
        #Create a 250 element Word2Vec modeller
        model_w2v = Word2Vec(p_chapters, size=250, window=7)
        #Train it over 10 epochs
        model_w2v.train(p_chapters,
                        total_examples=model_w2v.corpus_count,
                        epochs=10)
        model_w2v.init_sims()
        model_w2v.save('word2vec_model_samp4')

        #Fix non-included ones
        ids = []
        #Collect the dict. ID's for the intersection of the w2v and text vocabs.
        for k in model_w2v.wv.vocab:
            try:
                ids.append(text_dict.token2id[k])
            except KeyError:
                pass
        #[text_dict.token2id[k] for k in model_w2v.wv.vocab]
        #Create the new, smaller subset dictionary
        filt_dict = {new_id: text_dict[new_id] for new_id in ids}
        #Deal with the id numbers being off.
        id_pairs = zip(list(np.sort(ids)), range(len(model_w2v.wv.vocab)))
        renum_dict = dict(id_pairs)
        #Subset corpus
        filt_sent_corp = []
        for i in range(len(p_sentences)):
            corp_ = []
            for p in sent_corpus[i]:
                if p[0] in ids:
                    corp_.append((renum_dict[p[0]], p[1]))
            filt_sent_corp.append(corp_)
        #New, smaller Word2Vec model
        tdidf_w2v = TfidfModel(filt_sent_corp,
                               id2word=filt_dict,
                               smartirs='ltn')
        sent_w2v_tdidf = tdidf_w2v[filt_sent_corp]
        #Appropriate TF-IDF vectors
        w2v_tfidf_vecs = np.vstack(
            [sparse2full(c, len(filt_dict)) for c in sent_w2v_tdidf])

        #Collect all of the appropriate Word2Vectors
        w2v_vecs = [model_w2v.wv[word] for word in filt_dict.values()]
        w2v_vecs = np.array(w2v_vecs)
        w2v_vecs.shape = (len(filt_dict), 250)

        sent_w2v_mat = np.dot(w2v_tfidf_vecs, w2v_vecs)
        pickle.dump(sent_w2v_mat, open('w2v_sent_mat_samp4.pckl', 'wb'))

    return sent_vecs, sent_glove_mat, sent_w2v_mat
Example #19
def lda_scratch(topic_num, alpha, beta, passes):
    docs_list = []
    with open(filtered_data_file, 'r', encoding="UTF-8") as f:
        for line in f.readlines():
            docs_list.append(line.split())
    dictionary = Dictionary(docs_list)

    docs_idx_list = []

    for doc in docs_list:
        one_doc_idx = []
        for word in doc:
            one_doc_idx.append(dictionary.token2id[word])
        docs_idx_list.append(one_doc_idx)

    doc_num = len(docs_list)
    word_num = len(dictionary.keys())

    n_d_k = np.zeros((doc_num, topic_num))
    n_k_w = np.zeros((topic_num, word_num))
    n_k = np.zeros((topic_num, ))

    z = {}

    for d, doc in enumerate(docs_idx_list):
        for w_index, w in enumerate(doc):
            k = np.random.randint(0, topic_num)
            n_d_k[d, k] += 1
            n_k_w[k, w] += 1
            n_k[k] += 1
            z[(d, w_index)] = k

    n_d_k = n_d_k + np.ones((topic_num, )) * alpha
    n_k_w = n_k_w + np.ones((word_num, )) * beta

    theta = np.zeros((doc_num, topic_num))

    for i_pass in range(passes):
        print("I_Pass: {}".format(i_pass))
        for d, doc in enumerate(docs_idx_list):
            theta[d] = np.random.dirichlet(
                n_d_k[d] + np.ones((topic_num, )) * alpha, 1)
            for w_index, w in enumerate(doc):
                word = w
                topic = z[(d, w_index)]

                n_d_k[d, topic] -= 1
                n_k_w[topic, word] -= 1
                n_k[topic] -= 1
                temp_phi = n_k_w[:, word] / n_k
                p_z_k = n_d_k[d] * temp_phi
                # p_z_k = theta[d] * temp_phi

                new_topic = np.random.multinomial(
                    1, p_z_k / np.sum(p_z_k)).argmax()

                z[(d, w_index)] = new_topic
                n_d_k[d, new_topic] += 1
                n_k_w[new_topic, word] += 1
                n_k[new_topic] += 1

    for k_i in range(topic_num):
        print("K: {}".format(k_i))
        arg_list = (n_k_w[k_i]).argsort()[-10:]
        for idx, arg_index in enumerate(list(reversed(arg_list))):
            print(
                "{}: {}".format(dictionary[arg_index],
                                n_k_w[k_i, arg_index] / np.sum(n_k_w[k_i])),
                end="\t")
        print("\n")
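A hypothetical invocation (filtered_data_file is a module-level path in the original script; the hyperparameter values are only illustrative):
lda_scratch(topic_num=5, alpha=0.1, beta=0.01, passes=100)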
Example #20
# Build a word co-occurrence graph (nodes + links) from the counts

wcounts = dict()

for x in occurrences.keys():
    wcounts[x] = 0

for d in corpus:
    for w in d:
        wcounts[w] += 1

graphd = {"nodes": [], "links": []}

ease_id_dict = dict()

for x in dct.keys():
    ease_id_dict[dct[x]] = x

for x in wcounts.keys():
    graphd["nodes"].append({
        "id": x,
        "word": x,
        "count": wcounts[x],
        "linked": 0
    })

edges = dict()
for x in occurrences.keys():
    for y in occurrences[x].keys():
        st1 = x + ":" + y
        st2 = y + ":" + x