def mapfn(key, value):
    """Emit per-author word counts for the titles found in ``value``.

    Each non-blank line of ``value`` is split on ``':::'``; the last field
    is treated as the title and the second-to-last as a ``'::'``-separated
    author list.  Title words are lower-cased, stripped of ASCII
    punctuation and dropped when they are stop words (keys of the
    module-level ``allStopWords``) or end up empty.

    Args:
        key: Map key supplied by the caller; not used here.
        value: Text containing one record per line.

    Yields:
        ``(author_lowercased, {word: count})`` pairs, one per distinct
        author, after every line has been processed.

    Raises:
        IndexError: If a non-blank line contains no ``':::'`` separator.
    """
    import string

    punctuation = set(string.punctuation)
    counts_by_author = {}

    for line in value.splitlines():
        if not line.strip():
            # Blank lines carry no record; skipping avoids an IndexError.
            continue
        sect = line.split(':::')
        title = sect[-1]

        # Normalise the title once per line; every author of the line
        # receives the same tokens.
        tokens = []
        for word in title.split():
            lowered = word.lower()
            # Character filter instead of the Python 2-only
            # ``str.translate(None, chars)`` call form.
            cleaned = ''.join(ch for ch in lowered if ch not in punctuation)
            if not cleaned:
                continue
            # Check both spellings so a stop word such as "don't" is still
            # caught after its apostrophe has been removed.
            if lowered in allStopWords or cleaned in allStopWords:
                continue
            tokens.append(cleaned)

        for author in sect[-2].split('::'):
            counts = counts_by_author.setdefault(author.lower(), {})
            for token in tokens:
                # Always increment, so words repeated inside the first
                # title seen for an author are counted correctly.
                counts[token] = counts.get(token, 0) + 1

    for author in counts_by_author:
        yield author, counts_by_author[author]
def cleanTitle(row_title):
    """Split a title into tokens, strip non-alphanumerics, drop stop words.

    The title is split on single spaces, every character outside
    ``[A-Za-z0-9]`` is removed from each token, and every token that is a
    key of the module-level ``allStopWords`` is discarded.  Matching is
    case-sensitive, and tokens that become empty after stripping are not
    treated specially.

    Args:
        row_title: The raw title string.

    Returns:
        The list of cleaned tokens, in their original order.
    """
    tokens = [re.sub('[^A-Za-z0-9]+', '', piece)
              for piece in row_title.split(" ")]
    # Filter instead of ``list.remove``: ``remove`` only deletes the first
    # occurrence, so a stop word appearing twice used to survive.
    return [token for token in tokens if token not in allStopWords]
def mapfn(k, v):
    """Emit ``((author, word), 1)`` for every significant title word.

    ``v`` must consist of exactly three ``':::'``-separated fields:
    publication, a ``'::'``-separated author list, and the title.  Each
    title token is reduced to its alphanumeric characters; tokens that are
    stop words or shorter than two characters after cleaning are skipped.

    Args:
        k: Map key supplied by the caller; not used here.
        v: A single record string.

    Yields:
        ``((author, cleaned_word), 1)`` tuples.

    Raises:
        ValueError: If ``v`` does not split into exactly three fields.
    """
    from stopwords import allStopWords

    _publication, authors, title = v.split(':::')

    # NOTE(review): matching is case-sensitive, so "The" is only filtered
    # if allStopWords contains that exact spelling - confirm intended.
    words = []
    for token in title.split(' '):
        # Build a real string: ``filter()`` returns an iterator on
        # Python 3, which would make every emitted key unique.
        cleaned = ''.join(ch for ch in token if ch.isalnum())
        if len(cleaned) <= 1:
            # Covers empty tokens and single characters left after cleaning.
            continue
        if token in allStopWords or cleaned in allStopWords:
            continue
        words.append(cleaned)

    for author in authors.split('::'):
        for word in words:
            yield (author, word), 1
def reducefn(k, vs):
    """Return the first non-stop word of the joined values and its count.

    The strings in ``vs`` are joined with spaces, lower-cased and split on
    whitespace; words that are keys of ``allStopWords`` are discarded.

    Args:
        k: Reduce key; not used here (the original comment reads
            "for each author").
        vs: Iterable of strings.

    Returns:
        ``(word, count)`` for the first remaining word, where ``count`` is
        how often that word occurs among the remaining words, or ``None``
        when no word remains.
    """
    from stopwords import allStopWords

    stop_words = allStopWords.keys()
    tokens = " ".join(vs).lower().split()
    kept = [token for token in tokens if token not in stop_words]

    # NOTE(review): only the first surviving word is reported; the counts
    # of all other words are discarded - confirm this is intended.
    if not kept:
        return None
    first = kept[0]
    return first, kept.count(first)
def mapfn(k, v: str):
    """Emit ``(word, 1)`` for each significant word in ``v``.

    Words are the whitespace-separated tokens of ``v``, lower-cased and
    stripped of ASCII punctuation.  Tokens that are keys of
    ``allStopWords`` or at most one character long are skipped.

    Args:
        k: Map key supplied by the caller; not used here.
        v: The text to tokenise.

    Yields:
        ``(word, 1)`` tuples in input order.
    """
    import string
    from stopwords import allStopWords

    strip_punct = str.maketrans('', '', string.punctuation)
    ignored = allStopWords.keys()

    normalized = (token.lower().translate(strip_punct) for token in v.split())
    for token in normalized:
        if token not in ignored and len(token) > 1:
            yield token, 1
def mapfn(k, v):
    """Emit ``(author, word)`` for every non-stop word of every record.

    A progress message containing ``k`` is printed first.  Each non-empty
    line of ``v`` is split on ``':::'``; field 1 is the ``'::'``-separated
    author list and field 2 the space-separated title.  Words that are
    keys of ``allStopWords`` are skipped; matching is exact.

    Args:
        k: Map key; only used in the progress message.
        v: Newline-separated records.

    Yields:
        ``(author, word)`` tuples.

    Raises:
        IndexError: If a non-empty line has fewer than three fields.
    """
    print("processando map:{}".format(k))
    from stopwords import allStopWords

    stop_words = allStopWords.keys()
    for record in v.split('\n'):
        if not record:
            continue
        parts = record.split(':::')
        author_names = parts[1].split('::')
        title_tokens = parts[2].split(' ')
        for name in author_names:
            for token in title_tokens:
                if token in stop_words:
                    continue
                yield (name, token)
def mapfn(k, v):
    """Emit ``(author, word)`` for every significant title word.

    Each non-blank line of ``v`` is split on ``':::'``; field 1 is the
    ``'::'``-separated author list and field 2 the title.  The title is
    lower-cased and split on whitespace.  Tokens of length one and tokens
    that are keys of ``allStopWords`` are skipped.  In the remaining
    tokens hyphens become spaces and all other ASCII punctuation is
    removed, so a hyphenated token is emitted as one space-containing word.

    Args:
        k: Map key supplied by the caller; not used here.
        v: Text containing one record per line.

    Yields:
        ``(author, word)`` tuples.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    from stopwords import allStopWords
    import string

    punctuation = set(string.punctuation)

    for line in v.splitlines():
        if not line.strip():
            # Blank lines carry no record; skipping avoids an IndexError.
            continue
        fields = line.split(":::")
        words = fields[2].lower().split()
        authors = fields[1].split("::")

        for author in authors:
            for word in words:
                if len(word) <= 1 or word in allStopWords:
                    continue
                word = word.replace("-", " ")
                # Character filter instead of ``word.translate(None, ...)``,
                # which raises TypeError on Python 3.
                word = "".join(ch for ch in word if ch not in punctuation)
                yield author, word
def mapfn(file_name, file_contents):
    """Emit per-author word counts for the records in ``file_contents``.

    Each non-blank line is split on ``':::'``; field 1 is the
    ``'::'``-separated author list and field 2 the title.  Title words are
    lower-cased and stripped of ASCII punctuation; stop words (keys of
    ``allStopWords``) and words shorter than two characters after cleaning
    are ignored.

    Args:
        file_name: Map key supplied by the caller; not used here.
        file_contents: Text containing one record per line.

    Yields:
        ``(author_lowercased, {word: count})`` pairs, one per distinct
        author, after every line has been processed.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    import string
    from stopwords import allStopWords

    exclude = set(string.punctuation)
    result = {}

    for line in file_contents.splitlines():
        if not line.strip():
            # Blank lines carry no record; skipping avoids an IndexError.
            continue
        line_contents = line.split(':::')
        authors = line_contents[1]
        words = line_contents[2]

        # Sanitise the title once per line, before validating, so that
        # tokens such as "the," or "a." are recognised for what they are.
        tokens = []
        for word in words.split():
            raw = word.lower()
            cleaned = ''.join(ch for ch in raw if ch not in exclude)
            if len(cleaned) < 2:
                # Empty and single-letter words are ignored.
                continue
            # Check both spellings so a stop word such as "don't" is still
            # caught after its apostrophe has been removed.
            if raw in allStopWords or cleaned in allStopWords:
                continue
            tokens.append(cleaned)

        for author in authors.split('::'):
            # An author gets an entry even when no word survives.
            counts = result.setdefault(author.lower(), {})
            for token in tokens:
                counts[token] = counts.get(token, 0) + 1

    for author in result:
        yield author, result[author]