def dump_csv(self, filename, columns=DEFAULT_CSV_COLUMNS):
        """
        Write every matching tweet to the CSV file `filename`.

        Each entry in `columns` is a dotted path into the tweet dict,
        e.g. 'user.screen_name' selects tweet['user']['screen_name'].
        The default column set is ['id_str', 'user.screen_name',
        'timestamp', 'text'].

        A filename ending in '.gz' produces a gzip-compressed file
        (typically 3x-5x smaller for this kind of data); any other
        name produces a plain utf8 text file.

        Example:
        ########
        collection.since(one_hour_ago).dump_csv('my_tweets.csv', columns=['timestamp', 'text'])
        """
        # Pick the opener from the extension; both are called the same way.
        opener = gzip.open if filename.endswith('.gz') else open
        outfile = opener(filename, 'w')
        try:
            writer = UnicodeWriter(outfile)
            # Header row first, then one row per tweet.
            writer.writerow(columns)
            for tweet in self:
                writer.writerow(self._make_row(tweet, columns))
        finally:
            # Always release the file handle, even if a row fails to serialize.
            outfile.close()
# ---- Example 2 ----
def it_to_csv(it_of_its, outfile, mode="w"):
    """
    Write an iterable of row iterables to the CSV file `outfile`, one
    row per line (Row x Column).

    (eg: [["id", "score"], [1, 0.9], [2, 0.73], ...] will give
        id,score
        1,0.9
        2,0.73
    Can set 'mode' for appending ("a"). Default: write ("w")
    """
    # Normalize each row to a list lazily; UnicodeWriter handles encoding.
    rows = (list(record) for record in it_of_its)
    with open(outfile, mode) as handle:
        csv_writer = UnicodeWriter(handle)
        for row in rows:
            csv_writer.writerow(row)
# ---- Example 3 ----
def it_to_csv(it_of_its, outfile, mode="w"):
    """
    Dump rows (an iterable of iterables, Row x Column) to a CSV file.

    (eg: [["id", "score"], [1, 0.9], [2, 0.73], ...] will give
        id,score
        1,0.9
        2,0.73
    Can set 'mode' for appending ("a"). Default: write ("w")
    """
    with open(outfile, mode) as fh:
        out = UnicodeWriter(fh)
        for record in it_of_its:
            # Coerce to list so any iterable (tuple, generator) is accepted.
            out.writerow(list(record))
# ---- Example 4 ----
    def dump_csv(self, filename, columns=DEFAULT_CSV_COLUMNS):
        """
        Dump the matching tweets to the CSV file named `filename`.

        Columns are dotted paths into each tweet dict — the default
        ['id_str', 'user.screen_name', 'timestamp', 'text'] — so that
        'user.screen_name' grabs tweet['user']['screen_name'].

        If `filename` ends with `.gz` the output is gzipped (typically
        3x-5x compression on this kind of data, depending on the columns
        chosen); otherwise it is a plaintext (utf8) file.

        Example:
        ########
        collection.since(one_hour_ago).dump_csv('my_tweets.csv', columns=['timestamp', 'text'])
        """
        if filename.endswith('.gz'):
            outfile = gzip.open(filename, 'w')
        else:
            outfile = open(filename, 'w')
        # Context manager guarantees the handle is closed even on error,
        # matching the original try/finally behavior.
        with outfile:
            writer = UnicodeWriter(outfile)
            writer.writerow(columns)  # header row
            for tweet in self:
                writer.writerow(self._make_row(tweet, columns))
# ---- Example 5 ----
            # NOTE(review): this fragment starts mid-loop — `line` and `label`
            # are bound by an enclosing loop that is not visible here.
            text = line[TEXT_INDEX]
            clean = clean_text(text)

            tweet_texts.append(clean)
            tweet_labels.append(label)
            sys.stdout.write('+')  # one '+' per processed tweet (progress)
        sys.stdout.write('\n')

    # Whitespace-token frequency count over all cleaned tweet texts.
    dictionary = defaultdict(lambda: 0)
    for text in tweet_texts:
        for word in text.split():
            dictionary[word] += 1
        sys.stdout.write('.')  # one '.' per tweet counted (progress)

    # Persist (label, cleaned text) pairs with a header row.
    with open(CLEAN_DATA_FILENAME, 'wb') as outfile:
        writer = UnicodeWriter(outfile)
        writer.writerow(['label', 'clean_text'])
        for label, text in zip(tweet_labels, tweet_texts):
            writer.writerow([label, text])
            sys.stdout.write('-')  # one '-' per row written (progress)

    # Frequency bounds for the saved vocabulary: drop rare words
    # (< MIN_D occurrences) and very common words (> half the vocab size).
    MIN_D = 20
    MAX_D = .5 * len(dictionary)

    # Persist the filtered word counts, sorted alphabetically.
    with open(DICTIONARY_COUNT_FILENAME, 'wb') as outfile:
        writer = UnicodeWriter(outfile)
        writer.writerow(['word', 'count'])
        for word in sorted(dictionary.keys()):
            if dictionary[word] < MIN_D or dictionary[word] > MAX_D:
                continue
            writer.writerow([word, dictionary[word]])
            # NOTE(review): the lines from here down appear to be a duplicated
            # paste of the fragment above, grafted into the middle of the
            # preceding `with` block — `line` and `label` are not in scope
            # here. Verify against the original source and remove/merge.
            text  = line[TEXT_INDEX]
            clean = clean_text(text)

            tweet_texts.append(clean)
            tweet_labels.append(label)
            sys.stdout.write('+')  # progress marker per tweet
        sys.stdout.write('\n')

    # Whitespace-token frequency count over all cleaned tweet texts.
    dictionary = defaultdict(lambda: 0)
    for text in tweet_texts:
        for word in text.split():
            dictionary[word] += 1
        sys.stdout.write('.')  # progress marker per tweet

    # Persist (label, cleaned text) pairs with a header row.
    with open(CLEAN_DATA_FILENAME, 'wb') as outfile:
        writer = UnicodeWriter(outfile)
        writer.writerow(['label', 'clean_text'])
        for label,text in zip(tweet_labels, tweet_texts):
            writer.writerow([label, text])
            sys.stdout.write('-')  # progress marker per row

    # Frequency bounds for the saved vocabulary: drop rare words
    # (< MIN_D occurrences) and very common words (> half the vocab size).
    MIN_D = 20
    MAX_D = .5*len(dictionary)

    # Persist the filtered word counts, sorted alphabetically.
    with open(DICTIONARY_COUNT_FILENAME, 'wb') as outfile:
        writer = UnicodeWriter(outfile)
        writer.writerow(['word', 'count'])
        for word in sorted(dictionary.keys()):
            if dictionary[word] < MIN_D or dictionary[word] > MAX_D:
                continue
            writer.writerow([word, dictionary[word]])