예제 #1
0
def mapfn(key, value):
    """Count title words per author for a chunk of ':::'-separated records.

    For every line of ``value`` the last ':::' field is treated as the title
    and the second-to-last as a '::'-separated author list.  Title words are
    lower-cased, stripped of ``string.punctuation`` and skipped when they are
    stop words (either as written or after stripping).

    Args:
        key: Identifier of the input chunk; not used.
        value: Text containing one record per line.

    Yields:
        ``(author_lowercased, {word: count})`` pairs, one per distinct author.
    """
    punctuation = set(string.punctuation)
    counts = {}
    for line in value.splitlines():
        sect = line.split(':::')
        if len(sect) < 2:
            # Blank or malformed line: there is no author field to index.
            continue

        # Normalise the title once per line; every author of the line shares
        # it.  Joining characters (rather than translate(None, ...)) works on
        # both Python 2 and Python 3 strings.
        words = []
        for token in sect[-1].split():
            if token in allStopWords:
                continue
            wrd = ''.join(ch for ch in token if ch not in punctuation).lower()
            if not wrd or wrd in allStopWords:
                # Punctuation-only tokens collapse to '' and are not words.
                continue
            words.append(wrd)

        for author in sect[-2].split('::'):
            # Same counting path whether or not the author was seen before,
            # so repeated words in an author's first title are not lost.
            per_author = counts.setdefault(author.lower(), {})
            for wrd in words:
                per_author[wrd] = per_author.get(wrd, 0) + 1

    for auth in counts:
        yield auth, counts[auth]
예제 #2
0
def cleanTitle(row_title):
    """Split a title into alphanumeric-only tokens with stop words removed.

    The title is split on single spaces, every character outside
    ``[A-Za-z0-9]`` is deleted from each piece, and any piece that is a key
    of ``allStopWords`` is dropped.  The comparison is case-sensitive and
    pieces that become empty are kept, as before.

    Args:
        row_title: Raw title string.

    Returns:
        List of cleaned tokens in their original order.
    """
    tokens = [re.sub('[^A-Za-z0-9]+', '', part) for part in row_title.split(" ")]
    # Filtering (instead of list.remove) drops every occurrence of a stop
    # word, not just the first one.
    return [token for token in tokens if token not in allStopWords]
def mapfn(k, v):
    """Emit ``((author, word), 1)`` for each kept title word of one record.

    ``v`` must split on ':::' into exactly three fields (publication,
    authors, title); any other shape raises ``ValueError`` from the
    unpacking.  Authors are separated by '::' and title words by single
    spaces.

    Args:
        k: Record key; not used.
        v: One ':::'-separated record.

    Yields:
        ``((author, word), 1)`` where ``word`` is the title word reduced to
        its alphanumeric characters.  Words that are stop words (as written
        or after cleaning) or shorter than two characters after cleaning are
        skipped.
    """
    from stopwords import allStopWords as stopwords

    _publication, authors, title = v.split(':::')

    words = []
    for raw in title.split(' '):
        if raw in stopwords:
            continue
        # join() yields a real string on every Python version; on Python 3
        # filter() would return a lazy filter object instead.
        word = ''.join(ch for ch in raw if ch.isalnum())
        if len(word) > 1 and word not in stopwords:
            words.append(word)

    for author in authors.split('::'):
        for word in words:
            yield (author, word), 1
def reducefn(k, vs):
    """Reduce the strings collected for one key to a ``(word, count)`` pair.

    The strings in ``vs`` are joined with spaces, lower-cased and split on
    whitespace; words that are keys of ``allStopWords`` are discarded.

    Args:
        k: Reduce key (per the original comment, an author); not used.
        vs: Iterable of strings.

    Returns:
        ``(first_kept_word, occurrences_of_that_word)``, or ``None`` when no
        word survives the stop-word filter.
    """
    from stopwords import allStopWords

    stop = allStopWords.keys()
    kept = [tok for tok in " ".join(vs).lower().split() if tok not in stop]
    if not kept:
        return None
    # NOTE(review): only the first kept word is reported; the remaining words
    # are never returned.  Confirm whether a full word->count result was
    # intended before changing the return shape.
    head = kept[0]
    return head, kept.count(head)
예제 #5
0
def mapfn(k, v: str):
    """Emit ``(word, 1)`` for every significant word in ``v``.

    Words are the whitespace-separated pieces of ``v``, lower-cased and with
    all ``string.punctuation`` characters removed.  Stop words and results of
    length one or zero are skipped.

    Args:
        k: Record key; not used.
        v: Text to tokenize.

    Yields:
        ``(word, 1)`` tuples in input order.
    """
    from stopwords import allStopWords
    import string

    strip_punct = str.maketrans('', '', string.punctuation)
    stop = allStopWords.keys()
    cleaned = (token.lower().translate(strip_punct) for token in v.split())
    for word in cleaned:
        if len(word) > 1 and word not in stop:
            yield word, 1
예제 #6
0
def mapfn(k, v):
    """Emit ``(author, word)`` for each non-stop-word of each record's title.

    ``v`` is split on newlines; empty lines are ignored.  In every remaining
    line, field 1 (after splitting on ':::') holds '::'-separated authors and
    field 2 holds the space-separated title words.  Words are compared with
    the stop-word list exactly as written (no case folding or cleaning).

    Args:
        k: Key of the chunk being processed; only used in the progress print.
        v: Text with one record per line.

    Yields:
        ``(author, word)`` tuples, authors in the outer order and words in
        the inner order.
    """
    print("processando map:{}".format(k))
    from stopwords import allStopWords

    for record in v.split('\n'):
        if not record:
            continue
        parts = record.split(':::')
        names = parts[1].split('::')
        kept = [tok for tok in parts[2].split(' ') if tok not in allStopWords.keys()]
        for name in names:
            for tok in kept:
                yield (name, tok)
def mapfn(k, v):
    """Emit ``(author, word)`` for each kept title word of each record line.

    Each line of ``v`` is split on ':::'; field 1 holds '::'-separated
    authors and field 2 the title.  Title words are lower-cased; words of
    length one and stop words are skipped.  Hyphens are turned into spaces
    and all other ``string.punctuation`` characters are removed from the
    emitted word.

    Args:
        k: Record key; not used.
        v: Text with one record per line.

    Yields:
        ``(author, cleaned_word)`` tuples.
    """
    from stopwords import allStopWords
    import string

    punctuation = set(string.punctuation)
    for line in v.splitlines():
        lineSplit = line.split(":::")
        if len(lineSplit) < 3:
            # Blank or malformed line: no author/title fields to read.
            continue
        authors = lineSplit[1].split("::")

        # Clean the title once; every author of the line shares the result.
        words = []
        for word in lineSplit[2].lower().split():
            if len(word) <= 1 or word in allStopWords:
                continue
            # Replace hyphens first so hyphenated parts stay separated by a
            # space, then drop the rest of the punctuation.  Joining
            # characters works on Python 2 and 3, unlike translate(None, ...).
            word = word.replace("-", " ")
            word = ''.join(ch for ch in word if ch not in punctuation)
            if not word.strip() or word in allStopWords:
                # Nothing but punctuation was left, or cleaning exposed a
                # stop word.
                continue
            words.append(word)

        for author in authors:
            for word in words:
                yield author, word
def mapfn(file_name, file_contents):
    """Count title words per author for the records in one file.

    Each line of ``file_contents`` is split on ':::'; field 1 holds
    '::'-separated authors and field 2 the space-separated title words.
    Words are lower-cased and stripped of ``string.punctuation``.  Stop words
    (as written or after stripping) and results shorter than two characters
    are ignored.

    Args:
        file_name: Name of the input file; not used.
        file_contents: Text with one record per line.

    Yields:
        ``(author_lowercased, {word: count})`` pairs, one per distinct author.
    """
    import string
    from stopwords import allStopWords

    exclude = set(string.punctuation)
    result = {}
    for line in file_contents.splitlines():
        line_contents = line.split(':::')
        if len(line_contents) < 3:
            # Blank or malformed line: no author/title fields to read.
            continue

        # Sanitize the title once; every author of the line shares it.
        vwords = []
        for word in line_contents[2].split(' '):
            vword = word.lower()
            if vword in allStopWords:
                # Stopwords should be ignored
                continue
            # Strip punctuation before the length test so that empty tokens
            # (from repeated spaces) and punctuation-only tokens are not
            # counted as the word ''.
            vword = ''.join(ch for ch in vword if ch not in exclude)
            if len(vword) <= 1 or vword in allStopWords:
                # Empty and single letter words should be ignored
                continue
            vwords.append(vword)

        for author in line_contents[1].split('::'):
            # An author gets an entry even when the title has no kept words.
            author_counts = result.setdefault(author.lower(), {})
            for vword in vwords:
                author_counts[vword] = author_counts.get(vword, 0) + 1

    for author in result:
        yield author, result[author]