def findTags(file_path, pressnotes, cluster_number, language_code):
    """Extract up to 10 tags from a cluster of press notes and save them.

    Word frequencies are accumulated over every note's title and text;
    for each of the most frequent stems, the most frequent surface form
    recorded in ``extras`` becomes the tag.

    Args:
        file_path: destination path, passed through to saveNotesToFile.
        pressnotes: iterable of objects with .title and .text attributes.
        cluster_number: cluster id, passed through to saveNotesToFile.
        language_code: one of 'en', 'es', 'fr'.
    """
    language_codes = {'en': 'english', 'es': 'spanish', 'fr': 'french'}
    wordcount = WordCount(language_codes[language_code])
    wordcount_dictionary = {}
    extras = {}
    for pressnote in pressnotes:
        wordcount.parse_text_extra(pressnote.title, wordcount_dictionary, extras)
        wordcount.parse_text_extra(pressnote.text, wordcount_dictionary, extras)
    sorted_wordcount = sorted(wordcount_dictionary.items(),
                              key=operator.itemgetter(1), reverse=True)
    tags = []
    for stem, _count in sorted_wordcount:
        # Pick the most frequent original (unstemmed) form for this stem.
        # FIX: the original inner for-loop-with-break shadowed the outer
        # loop variable `item`; max() expresses "first of sorted desc"
        # directly and without the shadowing.
        best_extra, _ = max(extras[stem].items(), key=operator.itemgetter(1))
        tags.append(best_extra)
        if len(tags) >= 10:
            break
    saveNotesToFile(file_path, pressnotes, cluster_number, tags)
def file_to_words(file):
    """Return the words of *file* via WordCount's file reader.

    Checks that the file opens properly; the result is a list of words
    with spaces and newlines stripped.
    """
    return WordCount()._file_reader(file)
def writers_words(list_of_files, outputfile):
    """Count words across *list_of_files* and write a ranked report.

    All input files feed a single WordCount instance; the report lists
    every word with its rank, count and percentage, column-aligned with
    the module-level RANK_IND/WORD_IND/COUNT_IND/PERC_IND widths.

    Args:
        list_of_files: iterable of file paths to count.
        outputfile: path of the report file to (over)write.

    Returns:
        True on success.
    """
    counter = WordCount()
    for input_file in list_of_files:  # FIX: don't shadow builtin `file`
        counter.count_file(input_file)
    stats = counter.word_stats()
    # FIX: context manager guarantees the report file is closed even if
    # a write raises (the original leaked the handle on error).
    with open(outputfile, 'w') as output:
        output.write("Total words counted: "
                     + repr(counter.word_count_total()) + '\n')
        output.write('Rank'.rjust(RANK_IND)
                     + 'Word'.rjust(WORD_IND)
                     + 'Count'.rjust(COUNT_IND)
                     + ' ' * 5
                     + 'Percentage'.ljust(PERC_IND)
                     + '\n')
        rank = 1
        # Words sharing a count share a rank; rank advances per group.
        for (count, perc, list_of_words) in stats:
            for word in list_of_words:
                newline = (repr(rank).rjust(RANK_IND)
                           + word.rjust(WORD_IND)
                           + repr(count).rjust(COUNT_IND)
                           + ' ' * 5
                           + repr(perc).ljust(PERC_IND)
                           + '\n')
                output.write(newline)
            rank += 1
    return True
def adding_words(list_of_words):
    """Check _add_to_count updates both per-word and total counters."""
    wc = WordCount()
    words_added = 0
    for word in list_of_words:
        wc._add_to_count(word)
        words_added += 1
        assert wc._word_counts[word] > 0
        assert wc._total_words == words_added
def total_word_count(list_of_words):
    """Check word_count_total() reflects every word added."""
    wc = WordCount()
    added = 0
    for word in list_of_words:
        wc._add_to_count(word)
        added += 1
    assert wc.word_count_total() == added
    assert wc.word_count_total() == wc._total_words
def __init__(self, language_code):
    """Prepare a WordCount for the language given by its ISO code.

    Args:
        language_code: 'en', 'es' or 'fr'.
    """
    self.language_codes = dict(en='english', es='spanish', fr='french')
    self.language_code = language_code
    self.wordcount = WordCount(self.language_codes[language_code])
    self.wordcount_dictionary = {}
def count_file(filename):
    """Count one of the numeric test files and verify its counts.

    Only valid for the author's numeric-based fixtures, where the word
    'one' occurs once, 'two' twice, and so on (10 words total).
    """
    wc = WordCount()
    wc.count_file(filename)
    assert wc._total_words == 10
    expected = {'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4}
    for word in ('zero', 'one', 'two', 'three', 'four'):
        assert wc._word_counts[word] == expected[word]
def __init__(self, language_code, dictionary_path, dict_max_size=None, dir_notes=None):
    """Clustering variant that processes all note files together.

    (Original Polish comment: "wersja klasterujaca wszystkie pliki".)

    Args:
        language_code: 'en', 'es' or 'fr'.
        dictionary_path: path the word dictionary is loaded from.
        dir_notes: directory of note files fed to create().
        dict_max_size: optional cap on the loaded dictionary size.
    """
    self.bag_of_words = {}
    self.language_code = language_code
    lang_names = {'en': 'english', 'es': 'spanish', 'fr': 'french'}
    self.wordcount = WordCount(lang_names[language_code])
    self.word_indexes = self.load_dictionary(dictionary_path, dict_max_size)
    self.create(dir_notes)
def __init__(self, language_code, output_dir, dir_notes, dict_max_size=None):
    """Clustering variant that clusters each file's notes separately.

    (Original Polish comment: "wersja klastrujaca notki z kazdego pliku
    z osobna".)

    Args:
        language_code: 'en', 'es' or 'fr'.
        output_dir: destination directory passed to create2().
        dir_notes: directory of note files fed to create2().
        dict_max_size: optional cap on the dictionary size.
    """
    self.bag_of_words = {}
    self.language_code = language_code
    lang_names = {'en': 'english', 'es': 'spanish', 'fr': 'french'}
    self.wordcount = WordCount(lang_names[language_code])
    self.dict_max_size = dict_max_size
    self.create2(dir_notes, output_dir)
def count_mult_files(list_of_filenames):
    """Counting several numeric test files accumulates per-word counts.

    Each fixture contains the same words, so after the k-th file every
    count is k times its single-file value.
    """
    wc = WordCount()
    for mult, filename in enumerate(list_of_filenames, start=1):
        wc.count_file(filename)
        assert wc._total_words == 10 * mult
        assert wc._word_counts['zero'] == 0
        assert wc._word_counts['one'] == 1 * mult
        assert wc._word_counts['two'] == 2 * mult
        assert wc._word_counts['three'] == 3 * mult
        assert wc._word_counts['four'] == 4 * mult
def main(): print 'cleaning data.' data = pd.read_csv('../output/twitterDB_all.csv', header=None) # read data data.columns = ['tweet', 'city'] data_clean = data.dropna() # drop na print 'sentiment analysis.' data_clean.loc[:, 'senti_score'] = np.nan regex = '(\shttp[s]:\\\\)' data_clean.loc[:, 'tweet_content'] = data_clean.tweet \ .apply(lambda x: re.split(regex, x)[0]) regex2 = '\s@.+\:\s' data_clean.loc[:, 'tweet_content'] = data_clean.tweet_content \ .apply(lambda x: re.split(regex2, x)[-1]) # sentimental analysis data_clean.loc[:, 'senti_score'] = data_clean.tweet_content \ .apply(lambda x: SentiAnalyze(x)) data_city = data_clean[['city', 'senti_score', 'tweet_content']] data_city.reset_index(drop=True, inplace=True) # geocode the country name print 'convert city to country.' data_city.loc[:, 'country'] = np.nan city_names = data_clean.city.unique() city_country = {} print 'call google api' for city in city_names: city_country[city] = CountryToCity(city) print 'city country matching.' def f(x): if x in city_country.keys(): return city_country[x] else: return x data_city['country'] = data_city.city.apply(f) data_country = data_city[['country', 'senti_score', 'tweet_content']] print 'save the dataframe with sentimental score.' data_country.to_csv('../output/{0}.csv'.format(raw_input('File Name:\n'))) # word count print 'word count.' count = WordCount(data_country, 'country', 'tweet_content') print 'save the word count pickle file' filename = raw_input('WordCount Name:\n') with open('../output/{0}.pkl'.format(filename), 'w') as fh: pickle.dump(count, fh)
def proper_reset(num, list_of_words):
    """reset() must wipe both the total and the per-word counts."""
    dirty = WordCount()
    pristine = WordCount()  # untouched reference instance
    # Mutate the first instance's internal counters directly.
    dirty._total_words += num
    for word in list_of_words:
        dirty._word_counts[word] += 1
    # Sanity check: the two instances now genuinely differ.
    if num > 0:
        assert dirty._total_words > pristine._total_words
    if list_of_words:
        assert len(dirty._word_counts.items()) > len(pristine._word_counts.items())
    dirty.reset()
    # After reset the dirty instance matches the pristine one again.
    assert dirty._total_words == pristine._total_words
    assert dirty._total_words == 0
    assert len(dirty._word_counts.items()) == len(pristine._word_counts.items())
    assert len(dirty._word_counts.items()) == 0
def testWordCount(self):
    """End-to-end check of the Spark word-count job on two sentences."""
    wc = WordCount()
    sc = wc.getSparkContext("WordCountTest", "local[*]")
    sentences = [
        "Apache Spark is a fast and general engine for large-scale data processing.",
        "Spark runs on both Windows and UNIX-like systems",
    ]
    counts = wc.process(sc.parallelize(sentences)).collectAsMap()
    for word, expected in (('Spark', 2), ('UNIX-like', 1), ('runs', 1)):
        self.assertEqual(counts[word], expected)
    print(counts)
    sc.stop()
def _get_top_words(self, content, n):
    """Return the top *n* bracket-delimited link texts in *content*.

    Finds every substring between self.LEFT_BRACKET and
    self.RIGHT_BRACKET, counts occurrences with WordCount, and returns
    the n most frequent texts.

    Args:
        content: text to scan.
        n: number of top entries to return.

    Returns:
        List of up to n link strings, most frequent first.
    """
    left = [
        m.start()
        for m in re.finditer(re.escape(self.LEFT_BRACKET), content)
    ]
    right = [
        m.start()
        for m in re.finditer(re.escape(self.RIGHT_BRACKET), content)
    ]
    wc = WordCount()
    # FIX: zip() pairs the bracket positions safely; the original
    # indexed right[i] and raised IndexError whenever content held more
    # left brackets than right brackets.
    for start, end in zip(left, right):
        wc.add(content[start + len(self.LEFT_BRACKET):end])
    return [key[0] for key in wc.top(n)]
def handle_data(data_list, shrunk_line_list, global_list):
    # Merge one page's word counts (data_list) into the running
    # global_list of WordCount objects, and append a per-page "shrunk"
    # dict — this page's counts minus the globally top-X words — to
    # shrunk_line_list. Returns [shrunk_line_list, global_list].
    #
    # NOTE(review): SOURCE arrived with indentation collapsed onto one
    # line; the re-sort and top-X check below are rendered inside the
    # per-key loop because they read the loop variable `key` — confirm
    # against the original layout.
    print('still going' + str(time.time()))
    # This page's counts, sorted descending by count.
    sorted_counts = {
        k: v
        for k, v in sorted(
            data_list.items(), key=lambda item: item[1], reverse=True)
    }
    shrunk_list = {}
    for key in sorted_counts:
        if key in global_list:
            # Known word: bump its total and its page count.
            global_list[key].addCount(sorted_counts[key])
            global_list[key].incrementPageCount()
        else:
            # New word: start tracking it.
            global_list[key] = WordCount(key, sorted_counts[key])
        # Re-sort the global list by total count after this update.
        global_list = {
            k: v
            for k, v in sorted(global_list.items(),
                               key=lambda item: item[1].count,
                               reverse=True)
        }
        global_word_count_list = list(global_list)
        X = 5
        in_top_X = False
        # Exclude words currently in the global top X from the page's
        # shrunk list.
        for x in range(min(X, len(global_list))):
            if key == global_word_count_list[x]:
                in_top_X = True
        if not (in_top_X):
            shrunk_list[key] = sorted_counts[key]
    # Record this page's shrunk counts, sorted descending.
    shrunk_list = {
        k: v
        for k, v in sorted(
            shrunk_list.items(), key=lambda item: item[1], reverse=True)
    }
    shrunk_line_list.append(shrunk_list.copy())
    retVal = list()
    retVal.append(shrunk_line_list)
    retVal.append(global_list)
    return retVal
from datetime import datetime import csv from functools import reduce from wordcount import WordCount kms = boto3.client('kms') dynamodb = boto3.client('dynamodb') logs = boto3.client('logs') if 'SECRETS' in os.environ: SECRETS = json.loads(kms.decrypt( CiphertextBlob=base64.b64decode(os.environ['SECRETS']) )['Plaintext'].decode("utf-8")) wc = WordCount() def update_counter(word, book, n=1): response = dynamodb.update_item( TableName='words', Key={ 'Book': {'S': book}, 'BookWord': {'S': word} }, UpdateExpression='SET wordCount = if_not_exists(wordCount, :init) + :inc', ExpressionAttributeValues={ ':inc': {'N': str(n)}, ':init': {'N': '0'}, }, ReturnValues="UPDATED_NEW"
def clean_words(list):
    """Run WordCount's word cleaner over every entry, in place.

    The parameter name shadows the builtin ``list``; it is kept for
    interface compatibility with existing callers.

    Returns:
        The same (mutated) input list.
    """
    wc = WordCount()
    for index, word in enumerate(list):
        list[index] = wc._word_cleaner(word)
    return list
def words_stats(filename):
    """Count *filename* and return WordCount's word statistics."""
    counter = WordCount()
    counter.count_file(filename)
    return counter.word_stats()
def perc_words(filename):
    """Count *filename* and return per-word percentages."""
    counter = WordCount()
    counter.count_file(filename)
    return counter.words_percent()
def ranked_words(filename):
    """Count *filename* and return its words ranked by frequency."""
    counter = WordCount()
    counter.count_file(filename)
    return counter.words_ranked()
def alpha_words(filename):
    """Count *filename* and return its words in alphabetical order."""
    counter = WordCount()
    counter.count_file(filename)
    return counter.words_alphabetical()
def unique_word_count(list_of_words):
    """Add every word to a WordCount and return the distinct-word count."""
    wc = WordCount()
    for entry in list_of_words:
        wc._add_to_count(entry)
    return wc.word_count_unique()
def test_wordcount(self):
    """The WordCount job over the chekhov fixture yields 12339 results."""
    self.job = WordCount().run(input=[chekhov])
    # FIX: assertEquals is a deprecated alias of assertEqual (removed
    # in Python 3.12).
    self.assertEqual(len(list(self.results(self.job))), 12339)