def remove_rare_often_word(texts, low_value, high_value):
    # remove words whose TF-IDF falls outside [low_value, high_value] (overly frequent or overly rare)
    texts_tokenized = [simple_preprocess(doc) for doc in texts]
    dictionary = Dictionary(texts_tokenized)
    corpus = [dictionary.doc2bow(doc) for doc in texts_tokenized]

    tfidf = TfidfModel(corpus, id2word=dictionary)
    corpus_tfidf = tfidf[corpus]

    bad_words = []
    for sent_tfidf in tqdm(corpus_tfidf, desc="selecting bad words"):
        bad_words += [
            id for id, value in sent_tfidf
            if (value < low_value) or (value > high_value)
        ]

    dictionary.filter_tokens(bad_ids=bad_words)

    out_bow = [dictionary.doc2bow(doc) for doc in texts_tokenized]

    out_corpus = []
    for doc in tqdm(out_bow, desc='Creating out corpus'):
        out_corpus.append([dictionary.get(id) for id, value in doc])

    dict_tfidf = {
        dictionary.get(id): value
        for doc in corpus_tfidf for id, value in doc
        if (value >= low_value) and (value <= high_value)
    }

    return {
        'texts': out_corpus,
        'dict_tfidf': dict_tfidf,
        'dictionary': dictionary
    }
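A minimal usage sketch for the function above, with the gensim and tqdm imports it relies on; the toy texts and the 0.05/0.95 thresholds below are illustrative assumptions rather than values from the original.

from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from tqdm import tqdm

texts = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are common pets",
]
result = remove_rare_often_word(texts, low_value=0.05, high_value=0.95)
print(result['texts'])        # tokenized docs restricted to the kept vocabulary
print(result['dict_tfidf'])   # token -> TF-IDF score inside [low_value, high_value]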
Example No. 2
def getNotImportantTokens(careersCorpus, jobsCorpus):
    careerPostFilter, jobPostFilter = [], []
    dictionary = Dictionary([careersCorpus, jobsCorpus])
    corpus = [dictionary.doc2bow(line) for line in [careersCorpus, jobsCorpus]]
    tfidf = TfidfModel(corpus)
    careersVector = tfidf[corpus[0]]
    jobsVector = tfidf[corpus[1]]
    sorted_careersVector = sorted(careersVector,
                                  key=lambda w: w[1],
                                  reverse=True)
    sorted_jobsVector = sorted(jobsVector, key=lambda w: w[1], reverse=True)
    for word_id, word_count in sorted_careersVector[:30]:
        careerPostFilter.append(dictionary.get(word_id))
    for word_id, word_count in sorted_jobsVector[:30]:
        jobPostFilter.append(dictionary.get(word_id))
    return careerPostFilter, jobPostFilter
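A small usage sketch, assuming both arguments are already tokenized documents (plain lists of tokens); the example tokens are invented.

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

careers_tokens = "software engineer python developer backend team".split()
jobs_tokens = "hiring python developer remote full time role".split()
career_filter, job_filter = getNotImportantTokens(careers_tokens, jobs_tokens)
print(career_filter)  # up to 30 tokens with the highest TF-IDF weight in the careers doc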
Example No. 3
    def produce(self):        

        print('Getting src docs')
        docs = []
        doctokens = [] # aka Gensim's "text"
        stopwords = nltk.corpus.stopwords.words('english')
        for doc in self.src_doc_generator():
            (doc_id,doc_label,doc_str) = doc
            docs.append(doc)
            doctokens.append([token for token in nltk.word_tokenize(doc_str) if token not in stopwords])
            if len(docs) % 1000 == 0: print(len(docs))
                
        print('Creating the dictionary')
        dictionary = Dictionary(doctokens)
        #dictionary.compactify()
        #dictionary.filter_extremes(keep_n=None)
        if self.dictfile:
            dictionary.save_as_text(self.dictfile+'.dict', sort_by_word=True)

        with self.dbi as db:

            print('Creating WORD') # aka Gensim's "dictionary"
            db.create_table('word')
            for word_id, word_str in dictionary.iteritems():
                db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)',(word_id,word_str))
            
            print('Creating DOC and DOCWORD')
            db.create_table('doc')
            db.create_table('docword')
            for doc_idx, doc in enumerate(docs):
                db.cur.execute('INSERT INTO doc (doc_index,doc_id,doc_label,doc_str ) VALUES (?,?,?,?)',(doc_idx,doc[0],doc[1],doc[2]))
                doc_id = doc[0]
                for word_id, word_count in (dictionary.doc2bow(doctokens[doc_idx])):
                    word_str = dictionary.get(word_id)  # map the token id back to its string
                    db.cur.execute('INSERT INTO docword (doc_index,doc_id,word_id,word_str,word_count) VALUES (?,?,?,?,?)',(doc_idx,doc_id,word_id,word_str,word_count))
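Because produce() persists the dictionary with save_as_text, it can be reloaded in a later session; a brief sketch, where 'src_docs.dict' is a hypothetical stand-in for self.dictfile + '.dict'.

from gensim.corpora import Dictionary

# 'src_docs.dict' is a hypothetical filename; use whatever self.dictfile pointed to
dictionary = Dictionary.load_from_text('src_docs.dict')
print(len(dictionary), 'tokens reloaded')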
Example No. 4
def create_tf_idf(corporaPath):
    """
    Compute the TF-IDF scores based on the entire corpus
    """
    docs = []
    ids = []
    with open(corporaPath, "r") as csvfile:
        for line in csvfile:
            line = line.replace("\n", " ")
            parts = line.split(",")
            if (len(parts) == 5):
                id = parts[0]
                url = parts[1]
                comments = parts[2]
                if comments is not None:
                    comments = clean_text(comments, tknzr_strip_users)
                else:
                    comments = []
                caption = parts[3]
                if caption is not None:
                    caption = clean_text(caption, tknzr_strip_users)
                else:
                    caption = []
                tags = parts[4]
                if tags is not None:
                    tags = clean_text(tags, tknzr_strip_users)
                else:
                    tags = []
                # only record rows that contained all five fields
                docs.append(comments + caption + tags)
                ids.append(id)
    idx_to_id = {}
    for i, id in enumerate(ids):
        idx_to_id[i] = id

    dct = Dictionary(docs)
    corpus = [dct.doc2bow(line) for line in docs]
    model = TfidfModel(corpus)
    tfidf_factors = {}
    for i, doc in enumerate(corpus):
        temp = {}
        for word_id, value in model[doc]:
            word = dct.get(word_id)
            temp[word] = value
        tfidf_factors[idx_to_id[i]] = temp
    return tfidf_factors
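The core Dictionary/TfidfModel pattern that create_tf_idf builds on, shown standalone with invented documents (clean_text and tknzr_strip_users are external helpers that are not part of the snippet above).

from gensim.corpora import Dictionary
from gensim.models import TfidfModel

docs = [["sunset", "beach", "photo"], ["beach", "party", "friends"]]
dct = Dictionary(docs)
corpus = [dct.doc2bow(doc) for doc in docs]
model = TfidfModel(corpus)
# per-document mapping of token -> TF-IDF weight, as built in the loop above
per_doc_scores = [{dct.get(wid): value for wid, value in model[bow]} for bow in corpus]
print(per_doc_scores[0])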
Example No. 5
def make_tf_time_series(tweets_time_series, keep_only_common_words=True):
    tweets_time_series = break_up_sentences(tweets_time_series)
    tweets_dict = Dictionary(tweets_time_series)
    bow_time_series = [
        tweets_dict.doc2bow(tweets) for tweets in tweets_time_series
    ]
    tf_time_series = [
        make_term_frequency(time_step) for time_step in bow_time_series
    ]
    tf_time_series = [[(tweets_dict.get(tup[0]), tup[1]) for tup in time_step]
                      for time_step in tf_time_series]
    if keep_only_common_words:
        tweets_dict.filter_extremes(no_below=len(tweets_time_series),
                                    no_above=1)
        tf_time_series = [[
            tup for tup in time_step if tweets_dict.doc2idx([tup[0]])[0] != -1
        ] for time_step in tf_time_series]
    return tf_time_series
def get_tokens_frequency_df(series):
    """
    Count how many times each token appears across the series and return a DataFrame
    """
    corpus_lists = [doc for doc in series.dropna() if doc]
    dictionary = Dictionary(corpus_lists)
    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus_lists]
    token_freq_bow = defaultdict(int)
    for token_id, token_sum in itertools.chain.from_iterable(corpus_bow):
        token_freq_bow[token_id] += token_sum

    return pd.DataFrame(
        list(token_freq_bow.items()),
        columns=['token_id', 'token_count']).assign(
            token=lambda df1: df1.apply(
                lambda df2: dictionary.get(df2.token_id), axis=1),
            doc_appeared=lambda df1: df1.
            apply(lambda df2: dictionary.dfs[df2.token_id], axis=1)).reindex(
                labels=['token_id', 'token', 'token_count', 'doc_appeared'],
                axis=1).set_index('token_id')
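A usage sketch for get_tokens_frequency_df together with the imports it needs; the pandas Series of pre-tokenized documents is invented.

import itertools
from collections import defaultdict

import pandas as pd
from gensim.corpora import Dictionary

tokenized = pd.Series([["hotel", "clean", "staff"], ["staff", "friendly"], None])
freq_df = get_tokens_frequency_df(tokenized)
print(freq_df)  # token, token_count and doc_appeared, indexed by token_id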
Example No. 7
def ner_post_parsing(classification, blogs):
    tokenized_docs = [word_tokenize(doc) for doc in blogs]
    dictionary = Dictionary(tokenized_docs)
    bag_of_words_corpus = [
        dictionary.doc2bow(tokenized_doc) for tokenized_doc in tokenized_docs
    ]

    total_word_count = defaultdict(int)
    for word_id, word_count in itertools.chain.from_iterable(
            bag_of_words_corpus):
        total_word_count[word_id] += word_count

    # Create a sorted list from the defaultdict: sorted_word_count
    sorted_word_count = sorted(total_word_count.items(),
                               key=lambda w: w[1],
                               reverse=True)

    # Print the top word across all documents alongside its count
    for word_id, word_count in sorted_word_count[:1]:
        print(classification + " is talking about '" +
              str(dictionary.get(word_id)) + "', with " + str(word_count) +
              " occurrences")
def analyze(originfile, all=False):
    keywords = helper.getKeywords(originfile)
    os.chdir('./resources/stanford-corenlp-full-2018-10-05')
    os.system('kill $(lsof -t -i:9000)')
    cmd = 'java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 10000000000000 &'
    time.sleep(4)
    print("starting nlp service")
    with open(os.devnull, "w") as f:
        subprocess.call(cmd, shell=True, stderr=f, stdout=f)
    time.sleep(4)
    print("nlp service started")
    os.chdir('../../')
    nlp_wrapper = StanfordCoreNLP('http://localhost:9000')
    print("Number of processors: ", mp.cpu_count())
    if all:
        print("all")
        '''if not os.path.isfile('/resources/all_test.csv'):
            print("test file created")
            open('./resources/all_test.csv', 'w').close()'''
        conn = db.db_connection()
        dbo = db.db_operator(conn)
        spell = SpellChecker()
        counter = Value('i', 1)
        corpus_tok_all=[]
        '''for i in range(1790):
            print('i=' +str(i))
            print("limit= 10000")
            print("offset= "+str(10000*i))
            conn.connect()
            query = 'SELECT reviews.ReviewID, reviews.Country as \'Tourist_Country\', ' \
                    'hotels.CountryID as \'Hotel Country\', Good, reviews.Bad ' \
                    'FROM masterthesis.reviews, masterthesis.hotels ' \
                    'where hotels.HotelNumber=reviews.HotelNumber limit 10000 offset '+str(10000*i)+';'
            results = [list(x) for x in dbo.execute(query)];
            conn.disconnect()
            print("got results from sql")
            print("starting analysis")
            print("tot number rows= " + str(len(results)))
            try:
                print('analyzing 10000 rows '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,initargs=(counter, spell, nlp_wrapper,), )
                corpus_tok = pool.map_async(thread_function_row_only_all, [doc for doc in results]).get(timeout=1200)
                pool.close()
                pool.terminate()
                pool.join()
                print('got corpus_tok for 10000 rows '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            except TimeoutError:
                print("timeout error")
                pool.close()
                pool.terminate()
                pool.join()
                corpus_tok=[]
                for doc in results:
                    try:
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                        c=pool.map_async(thread_function_row_only_all, [doc]).get(timeout=60)
                        #print('pool close')
                        pool.close()
                        pool.terminate()
                        #print('pool join')
                        pool.join()

                    except TimeoutError:
                        print(str(doc)+" caused Exception")
                        pool.close()
                        pool.terminate()
                        #print('pool join')
                        pool.join()
                        c=[None]
                    corpus_tok.append(c[0])
            print("beginning removal of sents with contrast")
            corpus_tok = [r for r in corpus_tok if r != None]
            print('len corpus_tok_reduced= '+str(len(corpus_tok)))
            corpus_tok_all+=corpus_tok
            print('len corpus_tok_all= ' + str(len(corpus_tok_all)))
            if i%100==0 and i!=0:
                with open('./resources/all_test.csv', mode='a') as file:
                    writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    for c in corpus_tok_all:
                        writer.writerow(c)
                file.close()
                corpus_tok_all=[]
        '''


        '''
        corpus_tok_all=[]
        i=0
        kk=set()
        with open('./resources/all_test.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i%100000==0:
                    print(i)
                #if i%10000==0:break
                ar=((row[0].replace('[','')).replace(']','')).split(',')
                if ar[1][-1]!="'":#France, Metro.
                    ar[1]=ar[1]+','+ar[2]
                    for j in range(2,len(ar)-1):
                        ar[j]=ar[j+1]
                    del ar[len(ar)-1]
                ar[1]=ar[1][2:-1]
                ar[2] = (ar[2].replace("'", '')).replace(' ', '')
                rev=''.join(ar[3:])
                revlist= ar[:3]
                revlist.append(rev)
                tokens = ((((row[1].replace(']', '')).replace('[','')).replace("'",'')).replace(" ",'')).split(',')
                r=(revlist,tokens)
                k=ar[0]
                if k not in kk:
                    kk.add(k)
                    corpus_tok_all.append(r)
        file.close()
        corpus_tok=corpus_tok_all
        corpustokonly = [r[1] for r in corpus_tok]
        print("doing bigrams")
        # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
        lenc=len(corpus_tok)
        print("corpus_tok len = "+str(lenc))
        for idx in range(lenc):
            if idx%100000==0:
                print(idx)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            for token in bigram[corpustokonly[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    corpus_tok[idx][1].append(token)
        with open('./resources/corpus_tok_all.csv', mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                writer.writerows(corpus_tok)
        file.close()
        print("corpus_tok written")
        from gensim.corpora import Dictionary
        print("writing frequence file")
        '''

        



        '''all_set=set()
        for emotion in ['Good', 'Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if not (keyword == 'cleaning' or keyword=='pet'):
                    start_time = time.time()
                    print(keyword + ' ---- ' + time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    raw_corpus = helper.getRawCorpus(
                        csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                      encoding="utf8", newline='\n'), additionaldetails=True)
                    # corpus = helper.getCorpusTextFromRaw(raw_corpus)
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    print("starting analysis")
                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2,
                                   initargs=(counter, spell, nlp_wrapper,), )
                    corpus_tok = pool.map_async(thread_function_row_only, [doc for doc in raw_corpus]).get()
                    print('pool close')
                    pool.close()
                    print('pool join')
                    pool.join()
                    print("beginning removal of sents with contrast")
                    corpus_tok = [r for r in corpus_tok if r != None]
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
                    print('len all_set_tok before= ' + str(len(all_set)))
                    print('len corpus_tok= ' + str(len(corpus_tok)))
                    print('len corpus_tok+all_set_tok= ' + str(len(corpus_tok) + len(all_set)))
                    for sen in corpus_tok:
                        all_set.add((tuple(sen[0]),tuple(sen[1])))
                    print('len all_set_tok after= ' + str(len(all_set)))
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
        # Compute bigrams.
        if len(all_set) > 0:
            corpus_tok=[(list(x[0]),list(x[1])) for x in all_set]
            corpustokonly = [r[1] for r in corpus_tok]
            print("doing bigrams")
            # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
            bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
            for idx in range(len(corpus_tok)):
                for token in bigram[corpustokonly[idx]]:
                    if '_' in token:
                        # Token is a bigram, add to document.
                        corpus_tok[idx][1].append(token)
            from gensim.corpora import Dictionary
            print("writing frequence file")

            # Create a dictionary representation of the documents.
            dictionary = Dictionary(corpustokonly)

            alltok = []
            freq = []
            for doc in corpustokonly:
                for tok in doc:
                    alltok.append(tok)
            lencorpus = len(corpus_tok)
            print("len dictionary = " + str(len(dictionary.keys())))
            i = 0
            for t in dictionary:
                i += 1
                if i % 1000 == 0:
                    print("analyzing token " + str(i))
                freqsent = 0
                for doc in corpustokonly:
                    if dictionary.get(t) in doc:
                        freqsent += 1
                freq.append((t, dictionary.get(t), alltok.count(dictionary.get(t)),
                             alltok.count(dictionary.get(t)) / len(alltok), freqsent, freqsent / lencorpus))
            freq.sort(key=lambda tup: tup[5], reverse=True)
            for i in range(len(freq)):
                freq[i] = tuple(list(freq[i]) + [i])
            if not os.path.exists('resources/bow/allfreq/stanford/'):
                os.makedirs('resources/bow/allfreq/stanford/')
            with open('resources/bow/allfreq/stanford/all.txt',
                      'w') as f:
                for item in freq:
                    f.write(str(item) + '\n')
                f.close()

            print("writing bow file")
            top_tokens = [f[1] for f in freq[:500]]
            lentoptok = len(top_tokens)
            corpus_bow = {}
            toplen = 0
            for i in range(len(corpus_tok)):
                corpus_bow[i] = [0] * lentoptok
                if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                    toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
                for tok in corpus_tok[i][1]:
                    if tok in top_tokens:
                        corpus_bow[i][top_tokens.index(tok)] = 1

            with open('resources/bow/all.csv', mode='w') as file:
                writer = csv.writer(file, delimiter='|', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                writer.writerow([''] * toplen + top_tokens)
                for i in corpus_bow.keys():
                    writer.writerow(corpus_tok[i][0] + corpus_tok[i][1] + [''] * (
                            toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) + corpus_bow[i])
            file.close()
        '''
        


        # Create a dictionary representation of the documents.
        '''dictionary = Dictionary(corpustokonly)

        alltok = []
        freq = []
        for doc in corpustokonly:
            for tok in doc:
                alltok.append(tok)
        lencorpus = len(corpus_tok)
        print("len dictionary = " + str(len(dictionary.keys())))
        time.sleep(100000)
        counter = Value('i', 0)
        pool = mp.Pool(initializer=init_globals_token_analyzer, processes=mp.cpu_count(), initargs=(counter,corpustokonly,dictionary,lencorpus,alltok), )
        print("pool initialized")
        corpustokonly=None
        alltok=None
        del corpustokonly, alltok
        freq = pool.map_async(thread_function_row_only_token_analyzer, [t for t in dictionary]).get()
        pool.close()
        pool.terminate()
        pool.join()
        dictionary=None
        del dictionary
        global ctonly, dic, alltoks
        ctonly=None
        dic=None
        alltoks=None
        del ctonly,dic,alltoks
        print("frequence list len= "+str(len(freq)))
        print("frequence list created")
        freq.sort(key=lambda tup: tup[5], reverse=True)
        print("frequence list sorted")
        for i in range(len(freq)):
            if i%10000==0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            freq[i] = tuple(list(freq[i]) + [i])
        print("frequence list modified")
        if not os.path.exists('resources/bow/allfreq/stanford/'):
            os.makedirs('resources/bow/allfreq/stanford/')
        i=0
        '''
        '''with open('resources/bow/allfreq/stanford/all.txt', 'w') as f:
            for item in freq:
                i+=1
                if i%10000==0:
                    print(i)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                f.write(str(item) + '\n')
            f.close()'''

        corpus_tok=[]
        i=0
        with open('./resources/corpus_tok_all.csv', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i%100000==0:
                    print(i)
                corpus_tok.append(row)
        file.close()
        print("len corpus_tok= "+str(len(corpus_tok)))
        freq=[]
        i=0
        with open('./resources/bow/allfreq/stanford/all.txt', mode='r') as file:
            reader = csv.reader(file, delimiter='|', quotechar='"')
            for row in reader:
                i+=1
                if i==501:break
                freq.append(row)
        file.close()
        for i in range(len(freq)):
            freq[i]=freq[i][0]
            freq[i]=freq[i].replace("'",'')
            freq[i]=freq[i].replace('"','')
            freq[i]=freq[i].replace('(','')
            freq[i]=freq[i].replace(')','')
            freq[i]=freq[i].replace(' ','')
            freq[i]=freq[i].split(',')
            freq[i]=tuple(freq[i])
        for i in range(len(corpus_tok)):
            if i%100000==0:
                print(i)
            corpus_tok[i][0]=corpus_tok[i][0].replace('[','')
            corpus_tok[i][0]=corpus_tok[i][0].replace(']','')
            det=(corpus_tok[i][0].split(','))
            if 'São Tomé' in det[1]:#São Tomé and PrÃ\\\\xadncipe
                det[1]='  '+'São Tomé and PrÃ\xadncipe'+' '
            if det[1][-1]!="'":#France, Metro
                if 'Ivoire' in det[1]:#Cote d'Ivoire
                    det[1]=det[1].replace('\\','')
                    det[2]=det[2][1:]
                else:
                    det[1]=det[1]+','+det[2]
                    for j in range(2,len(det)-1):
                        det[j]=det[j+1]
                    del det[len(det)-1]
            det=det[:3]
            desc=(corpus_tok[i][0].split(','))[-1]
            det[0]=det[0][1:-1]
            det[1]=det[1][2:-1]
            det[2]=det[2][2:-1]
            desc=desc[3:-1]
            det.append(desc)
            corpus_tok[i][0]=det
            corpus_tok[i][1]=corpus_tok[i][1].replace("'",'')
            corpus_tok[i][1]=corpus_tok[i][1].replace(' ','')
            corpus_tok[i][1]=corpus_tok[i][1].replace('[','')
            corpus_tok[i][1]=corpus_tok[i][1].replace(']','')
            corpus_tok[i][1]=corpus_tok[i][1].split(',')
        print("writing bow file")
        top_tokens = [f[1] for f in freq[:400]]
        lentoptok = len(top_tokens)
        corpus_bow = {}
        toplen = 0
        print("corpus_tok_len= "+str(len(corpus_tok)))
        for i in range(len(corpus_tok)):
            corpus_bow[i] = [0] * lentoptok
            if i%100000==0:
                print(i)
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
            if len(corpus_tok[i][0] + corpus_tok[i][1]) > toplen:
                toplen = len(corpus_tok[i][0] + corpus_tok[i][1])
            for tok in corpus_tok[i][1]:
                if tok in top_tokens:
                    corpus_bow[i][top_tokens.index(tok)] = 1
        print("len corpus_bow keys= "+str(len(corpus_bow.keys())))
        print("got corpus_bow")
        j=0
        print("corpus_bow_len "+str(len(corpus_bow)))
        with open('resources/bow/all.csv', mode='w') as file:
            writer = csv.writer(file, delimiter='|', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow([''] * toplen + top_tokens)
            for i in corpus_bow.keys():
                j+=1
                if j%100000==0:
                    print(j)
                    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                writer.writerow(
                    corpus_tok[i][0] + corpus_tok[i][1] + [''] * (toplen - len(corpus_tok[i][0] + corpus_tok[i][1])) +
                    corpus_bow[i])
        file.close()
        print("over")
    else:
        print("not all")
        for emotion in ['Good','Bad']:
            print("begin " + emotion)
            for keyword in list(keywords.keys()):
                if emotion=='Good' and keyword=='cleaning':#cleaning good
                    start_time = time.time()
                    print(keyword+' ---- '+time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                    spell = SpellChecker()
                    counter = Value('i', 1)
                    corpus_tok_all=[]
                    #if not os.path.isfile('/resources/cleaning_test.csv'):
                        #open('./resources/cleaning_test.csv', 'w').close()
                    for i in range(400):#400
                        print(str(i))
                        offset=i*1000
                        limit=1000
                        print("starting reading")
                        print("limit="+str(limit))
                        print("offset="+str(offset))
                        raw_corpus = helper.getRawCorpus(
                            csv_file=open('resources/csvs/' + keyword + '_' + emotion.lower() + '.csv', mode='r',
                                          encoding="utf8", newline='\n'), additionaldetails=True, limit=limit, offset=offset)

                        #corpus = helper.getCorpusTextFromRaw(raw_corpus)
                        #raw_corpus_half_one = raw_corpus[:int(len(raw_corpus) / 2)]
                        #raw_corpus_half_two=raw_corpus[int(len(raw_corpus)/2):]
                        print("starting analysis")
                        pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                        try:
                            corpus_tok = pool.map_async(thread_function_row_only, [doc for doc in raw_corpus]).get(timeout=30)
                            pool.close()
                            pool.join()
                        except TimeoutError:
                            print("timeout error")
                            print('pool close')
                            pool.close()
                            print('pool terminate')
                            pool.terminate()
                            print('pool join')
                            pool.join()
                            corpus_tok=[]
                            for doc in raw_corpus:
                                try:
                                    pool = mp.Pool(initializer=init_globals, processes=mp.cpu_count() * 2, initargs=(counter,spell,nlp_wrapper,), )
                                    c=pool.map_async(thread_function_row_only, [doc]).get(timeout=30)
                                    #print('pool close')
                                    pool.close()
                                    #print('pool join')
                                    pool.join()
                                    '''thread = threading.Thread(target = thread_function_row_only, args = (doc))
                                    thread.start()
                                    thread.join()
                                    c=que.get()'''
                                except TimeoutError:
                                    print(str(doc)+" caused Exception")
                                    c=[None]
                                corpus_tok.append(c[0])
                        corpus_tok_reduced=[r for r in corpus_tok if r != None]
                        print("len corpus_tok: " + str(len(corpus_tok)))
                        print("len corpus_tok_reduced: " + str(len(corpus_tok_reduced)))
                        '''with open('./resources/cleaning_test.csv', mode='a') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                            for c in corpus_tok_reduced:
                                writer.writerow(c)
                        file.close()'''
                        corpus_tok_all+=corpus_tok_reduced
                        print("len corpus_tok_all: " + str(len(corpus_tok_all)))
                    '''
                    corpus_tok=[]
                    s=0
                    for doc in corpus:
                        newdoc=False
                        doc = doc.lower()
                        s += 1
                        if s % 10000 == 0:
                            print(str(s))
                        for con in constr_conjs:
                            if con in doc:
                                newdoc=True
                                break
                        if not newdoc:
                            toks = [spell.correction(tok['lemma']) for tok in
                                    nlp_wrapper.annotate(doc,
                                                         properties={'annotators': 'lemma, pos', 'outputFormat': 'json', })[
                                        'sentences'][0]['tokens']
                                    if tok['pos'] in ['NNS', 'NN'] and len(tok['lemma']) > 1]
                            toapp = []
                            for i in range(len(toks)):
                                if '/' in toks[i]:
                                    for tok in toks[i].split('/'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            toapp = []
                            for i in range(len(toks)):
                                if '-' in toks[i]:
                                    for tok in toks[i].split('-'):
                                        toapp.append(tok)
                            for tok in toapp:
                                toks.append(tok)
                            corpus_tok.append(toks)'''
                    #print("beginning removal of sents with contrast")
                    corpus_tok=corpus_tok_all
                    print("len corpus_tok: " + str(len(corpus_tok)))
                    ###############################################################################
                    # We find bigrams in the documents. Bigrams are sets of two adjacent words.
                    # Using bigrams we can get phrases like "machine_learning" in our output
                    # (spaces are replaced with underscores); without bigrams we would only get
                    # "machine" and "learning".
                    #
                    # Note that in the code below, we find bigrams and then add them to the
                    # original data, because we would like to keep the words "machine" and
                    # "learning" as well as the bigram "machine_learning".
                    #
                    # .. Important::
                    #     Computing n-grams of large dataset can be very computationally
                    #     and memory intensive.
                    #
                    # Compute bigrams.
                    if len(corpus_tok)>0:
                        corpustokonly=[r[1] for r in corpus_tok]
                        print("doing bigrams")
                        # Add bigrams and trigrams to docs (only ones that appear 10 times or more).
                        bigram = Phrases(corpustokonly, min_count=0.001 * len(corpus_tok))
                        for idx in range(len(corpus_tok)):
                            for token in bigram[corpustokonly[idx]]:
                                if '_' in token:
                                    # Token is a bigram, add to document.
                                    corpus_tok[idx][1].append(token)
                        from gensim.corpora import Dictionary
                        print("writing frequence file")

                        # Create a dictionary representation of the documents.
                        dictionary = Dictionary(corpustokonly)

                        alltok = []
                        freq=[]
                        for doc in corpustokonly:
                            for tok in doc:
                                alltok.append(tok)
                        lencorpus=len(corpus_tok)
                        print("len dictionary = "+str(len(dictionary.keys())))
                        i=0
                        for t in dictionary:
                            i+=1
                            if i%1000==0:
                                print("analyzing token "+str(i))
                            freqsent = 0
                            for doc in corpustokonly:
                                if dictionary.get(t) in doc:
                                    freqsent+=1
                            freq.append((t,dictionary.get(t),alltok.count(dictionary.get(t)),alltok.count(dictionary.get(t))/len(alltok),freqsent,freqsent/lencorpus))
                        freq.sort(key=lambda tup: tup[5], reverse=True)
                        for i in range(len(freq)):
                            freq[i]=tuple(list(freq[i])+[i])
                        if not os.path.exists('resources/bow/allfreq/stanford/'):
                            os.makedirs('resources/bow/allfreq/stanford/')
                        with open('resources/bow/allfreq/stanford/'+keyword+'_'+emotion.lower()+'.txt', 'w') as f:
                            for item in freq:
                                f.write(str(item)+'\n')
                            f.close()

                        print("writing bow file")
                        top_tokens=[f[1] for f in freq[:500]]
                        lentoptok=len(top_tokens)
                        corpus_bow={}
                        toplen=0
                        for i in range(len(corpus_tok)):
                            corpus_bow[i]=[0]*lentoptok
                            if len(corpus_tok[i][0]+corpus_tok[i][1])>toplen:
                                toplen=len(corpus_tok[i][0]+corpus_tok[i][1])
                            for tok in corpus_tok[i][1]:
                                if tok in top_tokens:
                                    corpus_bow[i][top_tokens.index(tok)]=1

                        with open('resources/bow/'+keyword+'_'+emotion.lower()+'.csv', mode='w') as file:
                            writer = csv.writer(file, delimiter='|', quotechar='"',
                                                         quoting=csv.QUOTE_MINIMAL)
                            writer.writerow(['']*toplen+top_tokens)
                            for i in corpus_bow.keys():
                                writer.writerow(corpus_tok[i][0]+corpus_tok[i][1]+['']*(toplen-len(corpus_tok[i][0]+corpus_tok[i][1]))+corpus_bow[i])
                        file.close()
                    print('------------------------------------------------------')
                    print(str(time.time() - start_time) + ' seconds to compute ' + keyword + ' ' + emotion)
    f.close()
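analyze() repeatedly augments each tokenized review with detected bigrams via gensim's Phrases, as described in the commented block above. A self-contained sketch of that pattern on a toy corpus; min_count and threshold are lowered here only so that the tiny example actually yields a bigram.

from gensim.models import Phrases

token_docs = [["room", "service", "was", "great"],
              ["room", "service", "slow"],
              ["great", "room"]]
bigram = Phrases(token_docs, min_count=1, threshold=1)
for doc in token_docs:
    for token in bigram[doc]:
        if '_' in token:
            doc.append(token)  # keep the unigrams and append bigrams such as "room_service"
print(token_docs[0])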
topics_str = '{' + ','.join(topics) + '}'
print('@ATTRIBUTE class {}'.format(topics_str))

print('@DATA')

for topic in topics:
    for doc in docs[topic]:
        total_words = 0

        doc_vec = np.zeros(dim_vec)

        # Calculate the weights
        tfidf_vector = model[dct.doc2bow(doc.split(' '))]
        for wid, weight in tfidf_vector:
            try:
                word = dct.get(wid)
                doc_vec += np.array(word2vec.wv[word] * weight)
                total_words += weight
            except:
                pass

        # Now we need to divide the vector by the number of words to
        # normalize the values
        doc_vec = doc_vec / total_words

        # At this point we have the document vector
        attributes = doc_vec.tolist()
        attributes.append(topic)
        print(','.join(str(x) for x in attributes))
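The fragment above depends on objects created elsewhere (model, dct, word2vec, docs, dim_vec). A self-contained sketch of the same TF-IDF-weighted averaging on a toy corpus, assuming gensim 4.x (where the Word2Vec size parameter is called vector_size).

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, Word2Vec

sentences = [["stars", "orbit", "the", "galaxy"],
             ["the", "galaxy", "contains", "stars"],
             ["planets", "orbit", "stars"]]
dct = Dictionary(sentences)
corpus = [dct.doc2bow(s) for s in sentences]
model = TfidfModel(corpus, id2word=dct)
word2vec = Word2Vec(sentences, vector_size=16, min_count=1, seed=0)

doc = sentences[0]
doc_vec = np.zeros(word2vec.vector_size)
total_weight = 0.0
for wid, weight in model[dct.doc2bow(doc)]:
    word = dct.get(wid)
    if word in word2vec.wv:          # skip words missing from the embedding vocabulary
        doc_vec += word2vec.wv[word] * weight
        total_weight += weight
if total_weight > 0:
    doc_vec /= total_weight          # normalize by the accumulated TF-IDF weight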
    def create_dictionary(self):
        YELP_DATASET_DIR = config.get("YELP_DATASET_DIR", None)
        SAVE_REVIEWS_BY_CATEGORY_DIRECTORY = config.get(
            "SAVE_REVIEWS_BY_CATEGORY_DIRECTORY", None)
        SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None)
        SAVE_BAG_OF_WORDS_DIR = config.get("SAVE_BAG_OF_WORDS_DIR", None)
        SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE = int(
            config.get("SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE", 25000))

        if not (YELP_DATASET_DIR and SAVE_REVIEWS_BY_CATEGORY_DIRECTORY
                and SAVE_DICTIONARY_DIR and SAVE_BAG_OF_WORDS_DIR
                and SAVE_DICTIONARY_DIR):
            print(
                "config keys are not set correctly in the config file: socialconfig.py"
            )
            exit(0)

        SAVE_UNFILTERED_DICTIONARY_DIR = os.path.join(SAVE_DICTIONARY_DIR,
                                                      "Unfiltered")

        if not os.path.exists(
                SAVE_REVIEWS_BY_CATEGORY_DIRECTORY) and not os.path.isdir(
                    SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
            raise ("Directory {d} does not exist".format(
                d=SAVE_REVIEWS_BY_CATEGORY_DIRECTORY))

        if not (os.path.exists(SAVE_BAG_OF_WORDS_DIR)
                and os.path.isdir(SAVE_BAG_OF_WORDS_DIR)):
            os.makedirs(SAVE_BAG_OF_WORDS_DIR)

        if not (os.path.exists(SAVE_UNFILTERED_DICTIONARY_DIR)
                and os.path.isdir(SAVE_UNFILTERED_DICTIONARY_DIR)):
            os.makedirs(SAVE_UNFILTERED_DICTIONARY_DIR)

        for pardir, sub_dirs, files in os.walk(
                SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):

            if len(files) > 0:
                error_count = 0
                review_docs = []
                negative_docs = []
                positive_docs = []

                doc_count = 0
                docs_per_file = SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE
                file_num = str((doc_count // docs_per_file) + 1)
                for file in files:
                    if "yelp_reviews_" in file and "category" in pardir:
                        reviews = get_reviews_iterable(
                            os.path.join(pardir, file))
                        yelp_category = pardir.split('/')[-1]

                        CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR = os.path.join(
                            SAVE_BAG_OF_WORDS_DIR, yelp_category)
                        if not (os.path.exists(
                                CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR) and os.path
                                .isdir(CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR)):
                            os.makedirs(CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR)

                        fname = os.path.join(
                            SAVE_BAG_OF_WORDS_DIR, yelp_category,
                            "{cat}_file_{file_num}.txt".format(
                                cat=yelp_category, file_num=file_num))
                        bow_file = open(fname, 'w')
                        print(
                            "Writing docs (in bag of words form) for {cat} to directory: {d}"
                            .format(cat=yelp_category,
                                    d=os.path.join(SAVE_BAG_OF_WORDS_DIR,
                                                   yelp_category)))
                        for review in reviews:
                            try:
                                review_dict = ujson.loads(review)
                            except Exception:
                                error_count += 1
                                continue  # skip reviews that fail to parse
                            adjs = review_dict.get("adjectives", None)
                            rating = int(review_dict.get("rating", -1))
                            if adjs:
                                doc_count += 1
                                bow_file.write(
                                    ujson.dumps(adjs.encode("utf-8")) + "\n")
                                review_docs.append(adjs.strip().split())
                                if (doc_count % docs_per_file) == 0:
                                    if bow_file:
                                        bow_file.close()
                                    file_num = str((doc_count //
                                                    docs_per_file) + 1)
                                    fname = os.path.join(
                                        SAVE_BAG_OF_WORDS_DIR, yelp_category,
                                        "{cat}_file_{file_num}.txt".format(
                                            cat=yelp_category,
                                            file_num=file_num))
                                    bow_file = open(fname, 'w')
                            if rating:
                                if rating > 3:
                                    positive_docs.append(adjs.strip().split())
                                elif rating < 3:
                                    negative_docs.append(adjs.strip().split())
                                else:
                                    pass
                print("Wrote {total} docs in {cat} category".format(
                    total=str(doc_count), cat=yelp_category))

                dictionary = Dictionary(review_docs)

                CATEGORY_SPECIFIC_DICT_DIR = os.path.join(
                    SAVE_UNFILTERED_DICTIONARY_DIR, yelp_category)
                POSITIVE_SUB_DIR = os.path.join(CATEGORY_SPECIFIC_DICT_DIR,
                                                "positive")
                NEGATIVE_SUB_DIR = os.path.join(CATEGORY_SPECIFIC_DICT_DIR,
                                                "negative")
                if not (os.path.exists(CATEGORY_SPECIFIC_DICT_DIR)
                        and os.path.isdir(CATEGORY_SPECIFIC_DICT_DIR)):
                    os.makedirs(CATEGORY_SPECIFIC_DICT_DIR)
                    os.makedirs(POSITIVE_SUB_DIR)
                    os.makedirs(NEGATIVE_SUB_DIR)

                dictionary.save(
                    os.path.join(
                        CATEGORY_SPECIFIC_DICT_DIR,
                        "{yelp_category}_dict.dict".format(
                            yelp_category=yelp_category)))
                dictionary.save_as_text(
                    os.path.join(
                        CATEGORY_SPECIFIC_DICT_DIR,
                        "{yelp_category}_dict.txt".format(
                            yelp_category=yelp_category)))
                sorted_doc_freqs = sorted(dictionary.dfs.items(),
                                          key=lambda x: x[1],
                                          reverse=True)

                # print("Will save file in:\n " + os.path.join(CATEGORY_SPECIFIC_DICT_DIR,"{yelp_category}_dict.txt".format(yelp_category=yelp_category)))
                with open(
                        os.path.join(
                            CATEGORY_SPECIFIC_DICT_DIR,
                            "{yelp_category}_words_doc_frequencies.txt".format(
                                yelp_category=yelp_category)), 'w') as df_file:
                    for (token_id, doc_freq) in sorted_doc_freqs:
                        df_file.write(
                            str(
                                dictionary.get(token_id, "Unknown").encode(
                                    'utf-8')) + " " + str(doc_freq) + "\n")

                del dictionary
                del review_docs
                del sorted_doc_freqs

                pos_dictionary = Dictionary(positive_docs)
                del positive_docs

                neg_dictionary = Dictionary(negative_docs)
                del negative_docs

                pos_dictionary.save(
                    os.path.join(
                        POSITIVE_SUB_DIR,
                        "{yelp_category}_pos_dict.dict".format(
                            yelp_category=yelp_category)))
                pos_dictionary.save_as_text(
                    os.path.join(
                        POSITIVE_SUB_DIR,
                        "{yelp_category}_pos_dict.txt".format(
                            yelp_category=yelp_category)))

                sorted_pos_doc_freqs = sorted(pos_dictionary.dfs.items(),
                                              key=lambda x: x[1],
                                              reverse=True)
                with open(
                        os.path.join(
                            POSITIVE_SUB_DIR,
                            "{yelp_category}_pos_words_doc_frequencies.txt".
                            format(yelp_category=yelp_category)),
                        'w') as df_file:
                    for (token_id, doc_freq) in sorted_pos_doc_freqs:
                        df_file.write(
                            str(
                                pos_dictionary.get(token_id, "Unknown").encode(
                                    'utf-8')) + " " + str(doc_freq) + "\n")

                del pos_dictionary
                del sorted_pos_doc_freqs

                neg_dictionary.save(
                    os.path.join(
                        NEGATIVE_SUB_DIR,
                        "{yelp_category}_neg_dict.dict".format(
                            yelp_category=yelp_category)))
                neg_dictionary.save_as_text(
                    os.path.join(
                        NEGATIVE_SUB_DIR,
                        "{yelp_category}_neg_dict.txt".format(
                            yelp_category=yelp_category)))
                sorted_neg_doc_freqs = sorted(neg_dictionary.dfs.items(),
                                              key=lambda x: x[1],
                                              reverse=True)
                with open(
                        os.path.join(
                            NEGATIVE_SUB_DIR,
                            "{yelp_category}_neg_words_doc_frequencies.txt".
                            format(yelp_category=yelp_category)),
                        'w') as df_file:
                    for (token_id, doc_freq) in sorted_neg_doc_freqs:
                        df_file.write(
                            str(
                                neg_dictionary.get(token_id, "Unknown").encode(
                                    'utf-8')) + " " + str(doc_freq) + "\n")

                del neg_dictionary
                del sorted_neg_doc_freqs

                print(
                    "{count} {cat} reviews were discarded because of parsing errors"
                    .format(count=error_count, cat=yelp_category))
                print("Created dictionary for {cat} tokens".format(
                    cat=yelp_category))
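The dictionaries saved above can be reloaded later for inspection; a brief sketch using a hypothetical category name and the "{category}_dict.dict" naming scheme from create_dictionary.

from gensim.corpora import Dictionary

# hypothetical path: SAVE_DICTIONARY_DIR/Unfiltered/<category>/<category>_dict.dict
d = Dictionary.load("Unfiltered/Restaurants/Restaurants_dict.dict")
most_common_id = max(d.dfs, key=d.dfs.get)
print(len(d), "tokens; highest document frequency:", d.get(most_common_id), d.dfs[most_common_id])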
Example No. 11
#creating dictionary and tfidf model
tokenized_corpus = RedditCorpus(path)
documents = list(itertools.chain(tokenized_corpus))
dct = Dictionary(documents)  # fit dictionary
corpus = [dct.doc2bow(doc) for doc in documents]  # convert dataset to BoW format
model = TfidfModel(corpus, id2word=dct)  # fit model
vector = model[corpus]

#write scores to files
subjects = []
file_glob = os.path.join(path, '*.txt')
subjects.extend(gfile.Glob(file_glob))
for step, subject in enumerate(subjects):
    # assumes the globbed subject files are in the same order as the documents in the corpus
    doc = model[corpus[step]]
    d = {dct.get(token_id): tfidf for token_id,tfidf in doc}
    output_file = open(output_path+"\\"+subject.split("\\")[-1],"w") 
    output_file.write(json.dumps(d))  
    output_file.close() 

#creating tf-idf matrix (num_terms must be defined before densifying the corpus)
num_terms = len(model[corpus].obj.idfs)
tfidf_dense = matutils.corpus2dense(model[corpus], num_terms).T

#creating random forest classifier for extracting important features
X_train, X_test, y_train, y_test = train_test_split(tfidf_dense, labels, test_size=0.4, random_state=0)
clf = RandomForestClassifier(n_estimators=10, random_state=0, n_jobs=1)
clf.fit(X_train, y_train)
importances = clf.feature_importances_
sorted_i = sorted(importances)
y_pred = clf.predict(X_test)
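sorted(importances) above discards which term each importance belongs to. A short follow-up sketch that maps the importances back to tokens, assuming importances and dct from the snippet are still in scope; after corpus2dense(...).T the column index of tfidf_dense equals the gensim term id.

import numpy as np

top_term_ids = np.argsort(importances)[::-1][:20]   # indices of the 20 most important features
for term_id in top_term_ids:
    print(dct.get(int(term_id)), importances[term_id])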