def get_music_bio(params):
    """Build a vocabulary and index-encoded sentences from a CSV of texts.

    The CSV at ``params['csv_path']`` is loaded and the ``content`` field of
    its first ``params['D']`` rows is processed in two passes: the first
    counts words to build the vocabulary, the second splits each text on
    ``"."`` and encodes every sentence as a list of vocabulary indices.
    All text is passed through ``remove_punctuation`` and lower-cased.

    Args:
        params: Mutable dict with the keys
            ``csv_path`` - path of the CSV file to read,
            ``D`` - number of rows to use (falsy means all rows),
            ``V`` - vocabulary size including ``'<UNK>'`` (falsy means
            every distinct word).
            ``D`` and ``V`` are overwritten in place with the resolved values.

    Returns:
        A tuple ``(sentences, sents, word2idx, all_word_counts_idx, params)``:
        ``sentences`` - cleaned sentence strings with more than one token,
        ``sents`` - the same sentences as lists of vocabulary indices,
        ``word2idx`` - word -> index, most frequent word first and
        ``'<UNK>'`` last,
        ``all_word_counts_idx`` - index -> number of occurrences counted in
        the first pass (0 for ``'<UNK>'``),
        ``params`` - the dict that was passed in.
    """
    from collections import Counter

    # 'records' yields one dict per CSV row; the previous value was a
    # misspelling that current pandas rejects with ValueError.
    files = pd.read_csv(params['csv_path']).to_dict('records')
    params['D'] = params['D'] if params['D'] else len(files)
    rows = files[:params['D']]

    # Pass 1: count word occurrences. Texts with fewer than two tokens are
    # ignored. Counting starts at 1 on first sight, so a count is the true
    # number of occurrences.
    word_counts = Counter()
    for row in rows:
        tokens = remove_punctuation(row['content']).lower().split()
        if len(tokens) > 1:
            word_counts.update(tokens)

    params['V'] = params['V'] if params['V'] else len(word_counts)
    V = min(params['V'], len(word_counts))

    # Keep the V - 1 most frequent words (ties stay in first-seen order) and
    # reserve the last index for the unknown-word token.
    ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    top_words = [word for word, _ in ranked[:V - 1]] + ['<UNK>']
    word2idx = {word: idx for idx, word in enumerate(top_words)}
    all_word_counts_idx = {
        idx: (0 if word == '<UNK>' else word_counts[word])
        for word, idx in word2idx.items()
    }
    print("finished counting")

    # Pass 2: encode each sentence; out-of-vocabulary words map to <UNK>.
    unk = word2idx['<UNK>']
    sents = []
    sentences = []
    for row in rows:
        for raw_sentence in row['content'].split("."):
            sentence = remove_punctuation(raw_sentence).lower()
            tokens = sentence.split()
            if len(tokens) > 1:
                sentences.append(sentence)
                sents.append([word2idx.get(token, unk) for token in tokens])

    return sentences, sents, word2idx, all_word_counts_idx, params
def main():
    """Load the text file and run the paragraph preprocessing steps.

    This covers tasks 1.1 - 1.6: the file named by ``text_file`` is read as
    UTF-8 and turned into paragraphs, the words "Gutenberg" and "gutenberg"
    are removed, and the paragraphs are then tokenized, stripped of
    punctuation, and stemmed.

    Returns:
        A tuple ``(par_copy, paragraphs)``: a shallow copy of the paragraphs
        taken after the "Gutenberg" removal but before tokenizing, and the
        fully processed paragraphs.
    """
    # Use a context manager so the file handle is closed instead of leaked.
    # NOTE(review): assumes makeParagraphArray reads the file eagerly - confirm.
    with codecs.open(text_file, "r", "utf-8") as f:
        paragraphs = functions.makeParagraphArray(f)

    # Remove "Gutenberg" in both capitalizations, then keep a copy of the
    # paragraphs as they are before tokenizing.
    paragraphs = functions.remove_specific_word("Gutenberg", paragraphs)
    paragraphs = functions.remove_specific_word("gutenberg", paragraphs)
    par_copy = copy.copy(paragraphs)

    paragraphs = functions.tokenize(paragraphs)
    paragraphs = functions.remove_punctuation(paragraphs)
    paragraphs = functions.stem(paragraphs)
    return par_copy, paragraphs
def main(sc, argv):
    """Print each word of a text file with its count, lowest count first.

    The file is read line by line, every line is split on single spaces,
    the resulting words are passed through ``udfStr.remove_punctuation`` and
    grouped to obtain a count per word. The collected rows are printed
    between two dashed separator lines.

    Args:
        sc: Object whose ``read.text(path)`` returns a DataFrame with a
            ``value`` column (presumably a SparkSession - confirm with the
            caller).
        argv: Argument list; ``argv[1]`` is the path of the text file.
    """
    source_path = argv[1]
    lines_df = sc.read.text(source_path)

    # One row per space-separated token, in a column named 'word'.
    words_df = lines_df.select(
        explode(split(lines_df.value, ' ')).alias('word')
    )
    cleaned_df = words_df.transform(udfStr.remove_punctuation('word'))
    counted_rows = cleaned_df.groupBy('word').count().collect()

    separator = '-' * 50
    print(separator)
    # Sort on the second field of each row, i.e. the count, ascending.
    ordered_rows = sorted(counted_rows, key=lambda row: row[1])
    for row in ordered_rows:
        print(row)
    print(separator)
def remove_test():
    """Check that punctuation around a word is stripped and case is kept.

    NOTE(review): the name does not start with ``test_``, so if this is meant
    to run under pytest's default discovery rules it will not be collected -
    confirm how it is invoked.
    """
    cleaned = remove_punctuation('!!!Hello!@#?')
    assert cleaned == 'Hello'
def test_remove_punctuation():
    """Check that remove_punctuation is callable and cleans mixed-case input.

    The expected result has the comma and exclamation mark removed and every
    letter lower-cased.
    """
    assert callable(remove_punctuation)

    raw_text = "hEllO,hOware!yOU"
    expected = "hellohowareyou"
    assert remove_punctuation(raw_text) == expected