def get_all_counts_and_lengths(counts_file, lengths_file):
    """Compute (or load cached) word counts and per-ad lengths.

    If either cache file is missing, tokenize every ad's 'FullDescription'
    from DATA_FILE, count occurrences of each word, record
    (num_sentences, num_words) per ad, and pickle both results.
    Otherwise load both from the existing cache files.

    Args:
        counts_file: path to the pickled word -> occurrence-count mapping.
        lengths_file: path to the pickled ad index -> (num_sentences,
            num_words) mapping.

    Returns:
        (counts, lengths) as described above.
    """
    if not os.path.exists(counts_file) or not os.path.exists(lengths_file):
        print('Counting all words in all descriptions.')
        counts = defaultdict(int)
        # NOTE: values stored are (num_sentences, num_words) tuples; the
        # defaultdict(int) default is never relied on, but the type is kept
        # because it is what gets pickled and read back elsewhere.
        lengths = defaultdict(int)
        data = datagetter.get_data(DATA_FILE)
        start = time.time()
        for ii, text in enumerate(data['FullDescription']):
            sentences = nltk.tokenize.sent_tokenize(text)
            total_words = 0  # Keeps track of total words in each ad
            for sentence in sentences:
                words = nltk.tokenize.word_tokenize(sentence)
                total_words += len(words)
                for word in words:
                    counts[word] += 1  # Track number of instances of a word
            # Track number of sentences and total number of words per ad
            lengths[ii] = len(sentences), total_words
            if not ii % 1000:
                print('Finished {:d} ads in {:.2f} seconds'.format(ii, time.time() - start))
        # Context managers guarantee the cache files are flushed and closed
        # even on error (the original passed unclosed handles to pickle.dump).
        with open(counts_file, 'wb') as out:
            pickle.dump(counts, out)
        with open(lengths_file, 'wb') as out:
            pickle.dump(lengths, out)
    else:
        print('Reading counts and lengths file.')
        counts = datagetter.read_file(counts_file)
        lengths = datagetter.read_file(lengths_file)
    return counts, lengths
#################################################################################
# Script
#################################################################################
if __name__ == '__main__':
    counts, lengths = get_all_counts_and_lengths('temp/total_word_counts.p', 'temp/sentence_lengths.p')
    if not counts:
        sys.stderr.write('Something went wrong reading the counts file.')
    sorted_counts = sorted(counts.iteritems(), key=operator.itemgetter(1), reverse=True)
    #for ii, pair in enumerate(sorted_counts[:100]):
    #    print '{0}: {1} - {2}'.format(ii, pair[0], pair[1])
    
    if not lengths:
         sys.stderr.write('Something went wrong reading the lengths file.')
#    print lengths

    data = datagetter.get_data()
    feats = get_word_count_feats(data, [word for word, val in sorted_counts[:500]], 'top500')

    
    print feats
    
    ''' 
    data = datagetter.get_data()
    add_feats(data, sorted_counts)

    pickle.dump(data, open('annotated_data.p', 'wb'))

    print data

<<<<<<< HEAD
#    plt.bar([ii for ii in range(100)], [val[1] for val in sorted_counts[:100]])