def dump_csv(self, filename, columns=DEFAULT_CSV_COLUMNS):
    """
    Dump the matching tweets to a CSV file specified by `filename`.

    The default columns are ['id_str', 'user.screen_name', 'timestamp',
    'text']. Columns are specified by their path in the tweet dictionary,
    so that 'user.screen_name' will grab tweet['user']['screen_name'].

    If `filename` ends with `.gz`, the output is gzip-compressed;
    otherwise it is a plaintext (utf8) file. Gzip typically achieves
    3x-5x compression on this type of data, depending on the columns
    chosen and the structure of the data.

    Example:
    ########
    collection.since(one_hour_ago).dump_csv('my_tweets.csv',
                                            columns=['timestamp', 'text'])
    """
    # Select the opener once instead of branching on the whole body.
    opener = gzip.open if filename.endswith('.gz') else open
    # GzipFile supports the context-manager protocol (Python 2.7+), so a
    # `with` block replaces the manual try/finally and guarantees close.
    with opener(filename, 'w') as outfile:
        writer = UnicodeWriter(outfile)
        writer.writerow(columns)  # header row first
        for tweet in self:
            writer.writerow(self._make_row(tweet, columns))
def it_to_csv(it_of_its, outfile, mode="w"):
    """
    Write an iterable of row-iterables to a CSV file, one row per inner
    iterable (Row x Column).

    For example, [["id", "score"], [1, 0.9], [2, 0.73], ...] produces:

        id,score
        1,0.9
        2,0.73

    Pass mode="a" to append to an existing file; the default mode="w"
    overwrites.
    """
    with open(outfile, mode) as fh:
        csv_writer = UnicodeWriter(fh)
        for record in it_of_its:
            csv_writer.writerow(list(record))
# NOTE(review): this chunk begins mid-iteration — `line` and `label` are
# bound by an enclosing loop over the raw data that is outside this view;
# the first five statements below carry that loop's body indentation.
    text = line[TEXT_INDEX]        # presumably the column holding raw tweet text — confirm upstream
    clean = clean_text(text)
    tweet_texts.append(clean)
    tweet_labels.append(label)
    sys.stdout.write('+')          # progress marker: one '+' per input line
sys.stdout.write('\n')

# Build a word -> occurrence-count table over all cleaned tweet texts.
dictionary = defaultdict(lambda: 0)
for text in tweet_texts:
    for word in text.split():
        dictionary[word] += 1
    sys.stdout.write('.')          # NOTE(review): assumed one '.' per text (outer loop) — confirm
with open(CLEAN_DATA_FILENAME, 'wb') as outfile:
    # 'wb' matches Python-2 csv conventions; UnicodeWriter handles encoding.
    writer = UnicodeWriter(outfile)
    writer.writerow(['label', 'clean_text'])
    for label, text in zip(tweet_labels, tweet_texts):
        writer.writerow([label, text])
        sys.stdout.write('-')      # NOTE(review): assumed one '-' per written row — confirm placement
# Frequency cutoffs: drop words seen fewer than MIN_D times, or more often
# than half the vocabulary size (len(dictionary) is the distinct-word count,
# not the document count — TODO confirm that is the intent).
MIN_D = 20
MAX_D = .5 * len(dictionary)
with open(DICTIONARY_COUNT_FILENAME, 'wb') as outfile:
    writer = UnicodeWriter(outfile)
    writer.writerow(['word', 'count'])
    for word in sorted(dictionary.keys()):
        if dictionary[word] < MIN_D or dictionary[word] > MAX_D:
            continue
        writer.writerow([word, dictionary[word]])
# NOTE(review): this chunk is a near-exact duplicate of the preceding
# clean-and-count script (only incidental spacing differs) — likely a
# copy/paste; consider deleting one copy once the live one is identified.
# It also begins mid-iteration: `line` and `label` are bound by an
# enclosing loop outside this view.
    text = line[TEXT_INDEX]        # presumably the column holding raw tweet text — confirm upstream
    clean = clean_text(text)
    tweet_texts.append(clean)
    tweet_labels.append(label)
    sys.stdout.write('+')          # progress marker: one '+' per input line
sys.stdout.write('\n')

# Build a word -> occurrence-count table over all cleaned tweet texts.
dictionary = defaultdict(lambda: 0)
for text in tweet_texts:
    for word in text.split():
        dictionary[word] += 1
    sys.stdout.write('.')          # NOTE(review): assumed one '.' per text (outer loop) — confirm
with open(CLEAN_DATA_FILENAME, 'wb') as outfile:
    # 'wb' matches Python-2 csv conventions; UnicodeWriter handles encoding.
    writer = UnicodeWriter(outfile)
    writer.writerow(['label', 'clean_text'])
    for label,text in zip(tweet_labels, tweet_texts):
        writer.writerow([label, text])
        sys.stdout.write('-')      # NOTE(review): assumed one '-' per written row — confirm placement
# Frequency cutoffs: drop words seen fewer than MIN_D times, or more often
# than half the vocabulary size (len(dictionary) is the distinct-word count,
# not the document count — TODO confirm that is the intent).
MIN_D = 20
MAX_D = .5*len(dictionary)
with open(DICTIONARY_COUNT_FILENAME, 'wb') as outfile:
    writer = UnicodeWriter(outfile)
    writer.writerow(['word', 'count'])
    for word in sorted(dictionary.keys()):
        if dictionary[word] < MIN_D or dictionary[word] > MAX_D:
            continue
        writer.writerow([word, dictionary[word]])