def getWordsMostFreqByAuthor(authorName, wordNumber=10):
    """Get the keyword statistics for a given author.

    The per-article keyword counts (``MapReduce`` entities) of every article
    linked to the author through ``ArtiAuth`` are summed per keyword, and the
    keywords with the highest totals are returned.

    :param authorName: The asked author's name (compared to ``Author.name``).
    :param wordNumber: the max number of results. Values <= 0 give an empty
        result.
    :returns: an ``OrderedDict`` of [word => count], ordered by decreasing
        count. Empty when no author with that name exists.
        example of use: data["lol"] = 42
    """
    authorQuery = Author.all()
    authorQuery.filter('name = ', authorName)
    author = authorQuery.get()
    if author is None:
        # No such author: do not fall through to a "keyAuthor = None" query,
        # which would match links that simply have no author set.
        return OrderedDict()

    artiAuths = ArtiAuth.all()
    artiAuths.filter('keyAuthor = ', author)

    # Numeric accumulator: a missing keyword starts at 0.
    totals = collections.defaultdict(int)
    for link in artiAuths:
        mapReduces = MapReduce.all()
        mapReduces.filter('keyArticle = ', link.keyArticle)
        for mR in mapReduces:
            try:
                # Reading the properties is what may fail to resolve.
                word = mR.keyWord
                count = mR.count
            except ReferencePropertyResolveError:
                # Single-argument call form: prints the same text on Python 2.
                print('Pas de reference word')
                continue
            totals[word] += count

    # Sort the results based on the sum of each word's occurrences.
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    # Keep exactly `wordNumber` entries (the previous slice dropped one).
    limit = max(wordNumber, 0)
    return OrderedDict(ranked[:limit])
def saveMapReduce(namefic):
    """Index a PDF article and store the results in the datastore.

    Steps, in order:

    1. Convert the PDF file to text (``convert_pdf_to_txt``).
    2. Run ``mapper`` then ``reducer`` on the text to get a
       ``{keyword: count}`` mapping.
    3. Store an ``Author`` (name taken from the 5th text line) and an
       ``Article`` (title taken from the first two text lines), with
       every non-letter character replaced by a space.
    4. Call ``getReferences(fic, article)`` for the references cited in the
       article (presumably stored as ArtCitedBib entities -- defined outside
       this function, to be confirmed).
    5. Store the ``ArtiAuth`` link, one ``MapReduce`` entity per keyword, and
       add each keyword count to the global ``Master`` total.

    :param namefic: the name of the PDF file.
    :raises IndexError: if the extracted text has fewer than 5 lines.
    """
    # save_pdf(namefic)
    fic = convert_pdf_to_txt(namefic)
    dataDict = reducer(mapper(fic))

    lines = fic.split("\n")

    # NOTE(review): a new Author entity is stored on every call, even when one
    # with the same name already exists, and the name is not stripped (unlike
    # the title). Lookups by name only fetch one Author -- confirm intended.
    authorStr = re.sub(r"[^a-zA-Z\s]", " ", lines[4])
    author = Author(name=authorStr)
    author.put()

    titre = re.sub(r"[^a-zA-Z\s]", " ", lines[0] + lines[1]).strip()
    article = Article(name=titre, fileName=namefic)
    article.put()

    getReferences(fic, article)

    artiAuth = ArtiAuth(keyAuthor=author, keyArticle=article)
    artiAuth.put()

    for cle, count in dataDict.items():
        mapReduce = MapReduce(keyWord=cle, keyArticle=article, count=count)
        mapReduce.put()

        # Update or create the global total with a single query: get()
        # returns None when no Master exists for this keyword, so the former
        # separate count() round-trip is not needed.
        checkMaster = Master.all()
        checkMaster.filter("keyWord =", cle)
        master = checkMaster.get()
        if master is None:
            master = Master(keyWord=cle, count=count)
        else:
            master.count = master.count + count
        master.put()
def getArticleByAuthor(authorName, artNumber=10):
    """Get the articles from a given author.

    :param authorName: a string which is the author's name (compared to
        ``Author.name``).
    :param artNumber: the max count of results to give in the returned list.
    :returns: a list of strings being the articles' names. Empty when no
        author with that name exists.
    """
    authorQuery = Author.all()
    authorQuery.filter('name = ', authorName)
    author = authorQuery.get()
    if author is None:
        # No such author: do not fall through to a "keyAuthor = None" query,
        # which would match links that simply have no author set.
        return []

    artiAuths = ArtiAuth.all()
    artiAuths.filter('keyAuthor = ', author)
    return [link.keyArticle.name for link in artiAuths.fetch(limit=artNumber)]