def term_class_count():
    """Build a term-count matrix with one row per 20 Newsgroups category.

    For each category of the 20 Newsgroups training split, the posts are
    fetched with headers, footers and quotes removed, each post is passed
    through ``utility.clean_word``, and the cleaned posts are concatenated
    into a single string. The per-category strings are then vectorized with
    a default ``CountVectorizer``.

    Returns:
        The matrix produced by ``CountVectorizer.fit_transform``; row ``i``
        corresponds to the ``i``-th entry of the dataset's ``target_names``.
    """
    all_classes = list(fetch_20newsgroups(subset="train").target_names)

    all_class_docs = []
    for doc_class in all_classes:
        category_ob = fetch_20newsgroups(
            subset="train",
            categories=[doc_class],
            remove=("headers", "footers", "quotes"),
        )
        # Build the class text in one pass instead of growing a string with
        # ``+=`` inside the loop. Every cleaned post is still prefixed with a
        # single space, so the resulting string is unchanged.
        per_class_docs = "".join(
            " " + utility.clean_word(doc) for doc in category_ob.data
        )
        all_class_docs.append(per_class_docs)

    return CountVectorizer().fit_transform(all_class_docs)
def term_class_count():
    """Build a term-count matrix with one row per 20 Newsgroups category.

    For each category of the 20 Newsgroups training split, the posts are
    fetched with headers, footers and quotes removed, each post is passed
    through ``utility.clean_word``, and the cleaned posts are concatenated
    into a single string. The per-category strings are then vectorized with
    a default ``CountVectorizer``.

    Returns:
        The matrix produced by ``CountVectorizer.fit_transform``; row ``i``
        corresponds to the ``i``-th entry of the dataset's ``target_names``.
    """
    all_classes = list(fetch_20newsgroups(subset="train").target_names)

    all_class_docs = []
    for doc_class in all_classes:
        category_ob = fetch_20newsgroups(
            subset="train",
            categories=[doc_class],
            remove=("headers", "footers", "quotes"),
        )
        # Build the class text in one pass instead of growing a string with
        # ``+=`` inside the loop. Every cleaned post is still prefixed with a
        # single space, so the resulting string is unchanged.
        per_class_docs = "".join(
            " " + utility.clean_word(doc) for doc in category_ob.data
        )
        all_class_docs.append(per_class_docs)

    return CountVectorizer().fit_transform(all_class_docs)
# Read the #gopatriots tweet dump (one JSON record per line), clean the text
# of every tweet with utility.clean_word, and write the cleaned texts to
# Tweets_only.csv as a pandas Series.
import json
# NOTE(review): json_normalize is no longer used by this script, and this
# import path has been deprecated since pandas 1.0. The line is kept only in
# case other code in this file still relies on the name; drop it otherwise.
from pandas.io.json import json_normalize
import codecs
import pandas
import utility

tweets = []
# Read raw bytes and let json.loads do the decoding, so the result does not
# depend on the platform's default text encoding.
with codecs.open('../../Datasets/tweets/tweet_data/tweets_#gopatriots.txt', 'rb') as data_file:
    for line in data_file:
        # A blank line (e.g. a trailing newline at end of file) is not a
        # JSON record; skip it instead of failing inside json.loads.
        if not line.strip():
            continue
        # Assumes each record's 'tweet' value is a JSON object with a
        # top-level 'text' field. Reading it directly avoids building a
        # one-row DataFrame per tweet just to extract a single value.
        tweet_text = json.loads(line)['tweet']['text']
        tweets.append(utility.clean_word(tweet_text))

data = pandas.Series(tweets)
data.to_csv('../../Datasets/tweets/generated/Tweets_only.csv', encoding='utf-8')
# Read the #gopatriots tweet dump (one JSON record per line), clean the text
# of every tweet with utility.clean_word, and write the cleaned texts to
# Tweets_only.csv as a pandas Series.
import json
# NOTE(review): json_normalize is no longer used by this script, and this
# import path has been deprecated since pandas 1.0. The line is kept only in
# case other code in this file still relies on the name; drop it otherwise.
from pandas.io.json import json_normalize
import codecs
import pandas
import utility

tweets = []
# Read raw bytes and let json.loads do the decoding, so the result does not
# depend on the platform's default text encoding.
with codecs.open('../../Datasets/tweets/tweet_data/tweets_#gopatriots.txt', 'rb') as data_file:
    for line in data_file:
        # A blank line (e.g. a trailing newline at end of file) is not a
        # JSON record; skip it instead of failing inside json.loads.
        if not line.strip():
            continue
        # Assumes each record's 'tweet' value is a JSON object with a
        # top-level 'text' field. Reading it directly avoids building a
        # one-row DataFrame per tweet just to extract a single value.
        tweet_text = json.loads(line)['tweet']['text']
        tweets.append(utility.clean_word(tweet_text))

data = pandas.Series(tweets)
data.to_csv('../../Datasets/tweets/generated/Tweets_only.csv', encoding='utf-8')