Example #1
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

import utility


def term_class_count():
    """Concatenate the cleaned training documents of each newsgroup class and return the per-class term-count matrix."""

    all_classes = list(fetch_20newsgroups(subset='train').target_names)
    all_class_docs = []

    # Build one aggregated "document" per class from its cleaned training posts.
    for doc_class in all_classes:
        category_ob = fetch_20newsgroups(subset='train',
                                         categories=[doc_class],
                                         remove=('headers', 'footers',
                                                 'quotes'))
        docs_list = category_ob.data
        per_class_docs = ''

        for doc in docs_list:
            clean_doc = utility.clean_word(doc)
            per_class_docs += ' ' + clean_doc

        all_class_docs.append(per_class_docs)

    # Rows of the resulting matrix correspond to classes, columns to vocabulary terms.
    count_vector = CountVectorizer()
    freq_matrix = count_vector.fit_transform(all_class_docs)
    # print(freq_matrix.shape)
    # print(freq_matrix)

    return freq_matrix
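
A minimal usage sketch (an assumption, not part of the original project): because the function returns one row per newsgroup class, the count matrix can be passed straight to scikit-learn's TfidfTransformer to weight terms per class.

# Sketch only: convert the per-class term counts into TF-IDF weights.
from sklearn.feature_extraction.text import TfidfTransformer

freq_matrix = term_class_count()                      # shape: (number of classes, vocabulary size)
tfidf_matrix = TfidfTransformer().fit_transform(freq_matrix)
print(tfidf_matrix.shape)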
Example #2
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

import utility


def term_class_count():
    """Concatenate the cleaned training documents of each newsgroup class and return the per-class term-count matrix."""

    all_classes = list(fetch_20newsgroups(subset="train").target_names)
    all_class_docs = []

    for doc_class in all_classes:
        category_ob = fetch_20newsgroups(
            subset="train", categories=[doc_class], remove=("headers", "footers", "quotes")
        )
        docs_list = category_ob.data
        per_class_docs = ""

        for doc in docs_list:
            clean_doc = utility.clean_word(doc)
            per_class_docs += " " + clean_doc

        all_class_docs.append(per_class_docs)

    count_vector = CountVectorizer()
    freq_matrix = count_vector.fit_transform(all_class_docs)
    # print(freq_matrix.shape)
    # print(freq_matrix)

    return freq_matrix
Example #3
import json
import codecs

import pandas
from pandas import json_normalize  # pandas.io.json.json_normalize is deprecated in recent pandas

import utility

# result_file = codecs.open('../../Datasets/tweets/generated/Tweets_only.csv', 'a')
tweets = []
# Each line of the file is one JSON record; the tweet payload sits under the 'tweet' key.
with codecs.open('../../Datasets/tweets/tweet_data/tweets_#gopatriots.txt') as data_file:
    for line in data_file:
        tweet_obj = json_normalize(json.loads(line)['tweet'])
        # data.append(pandas.Series([tweet_obj['text'][0]]))
        tweets.append(utility.clean_word(tweet_obj['text'][0]))
        #result_file.write(unicode(str(tweet_obj['text'][0])))
        #result_file.write('\n')
        # print tweet_obj['text'][0]

# print data.head()
# result_file.close()
# Save the cleaned tweet texts as a single-column CSV.
data = pandas.Series(tweets)
data.to_csv('../../Datasets/tweets/generated/Tweets_only.csv',
            encoding='utf-8')
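
The cleaned tweets collected here can be vectorised the same way as the newsgroup documents in Example #1; a minimal sketch, assuming the tweets list from the script above is still in memory:

# Sketch only: build a tweet-by-term count matrix from the cleaned texts.
from sklearn.feature_extraction.text import CountVectorizer

tweet_counts = CountVectorizer().fit_transform(tweets)
print(tweet_counts.shape)  # (number of tweets, vocabulary size)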
Example #4
import json
import codecs

import pandas
from pandas import json_normalize  # pandas.io.json.json_normalize is deprecated in recent pandas

import utility

# result_file = codecs.open('../../Datasets/tweets/generated/Tweets_only.csv', 'a')
tweets = []
with codecs.open('../../Datasets/tweets/tweet_data/tweets_#gopatriots.txt') as data_file:
    for line in data_file:
        tweet_obj = json_normalize(json.loads(line)['tweet'])
        # data.append(pandas.Series([tweet_obj['text'][0]]))
        tweets.append(utility.clean_word(tweet_obj['text'][0]))
        #result_file.write(unicode(str(tweet_obj['text'][0])))
        #result_file.write('\n')
        # print tweet_obj['text'][0]

# print data.head()
# result_file.close()
data = pandas.Series(tweets)
data.to_csv('../../Datasets/tweets/generated/Tweets_only.csv', encoding='utf-8')
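
All four examples rely on a utility.clean_word helper that is not shown on this page. Purely as an assumption about the kind of normalization it performs (not the project's actual implementation), a minimal stand-in could look like this:

# Hypothetical stand-in for utility.clean_word; the real helper is not shown in these examples.
import re

def clean_word(text):
    """Lowercase the text and keep only letters, collapsing runs of whitespace (assumed behaviour)."""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)        # drop digits, punctuation and other symbols
    return re.sub(r'\s+', ' ', text).strip()     # collapse repeated whitespace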