class PredictorFuncs:
    '''Classification helpers.

    Scores feed entries against per-word category weights stored in the
    DepWords table and incrementally updates those weights from the words
    surrounding each known word.
    '''

    def __init__(self):
        '''Initialise mongodb connection.'''
        self.mongo = Mongo()

    def removeStopWords(self, splitText):
        '''Return the words of ``splitText`` with English stopwords removed.

        A few feed-specific artefacts ("read more" fragments, ellipses) are
        treated as stopwords as well.
        '''
        # Membership tests against a set are O(1); the original scanned a list.
        modified_stopwords = set(stopwords.words('english'))
        modified_stopwords.update(
            ('[...]', '.read', 'read', 'more…', '…', 'more...', 'more.read'))
        return [w for w in splitText if w not in modified_stopwords]

    def stemWords(self, sent, rmStopWords=True):
        '''Lemmatise every word of the sentence string ``sent`` (verb form)
        and return the rebuilt string, optionally stripping stopwords first.
        '''
        words = sent.split()
        if rmStopWords:
            words = self.removeStopWords(words)
        # Instantiate the lemmatizer once, not once per word as before.
        lemmatizer = WordNetLemmatizer()
        return " ".join(lemmatizer.lemmatize(word, 'v') for word in words)

    def processAllExistingFeeds(self):
        '''Classify every unprocessed feed entry and persist the resulting
        category weights back to mongo.'''
        allFeeds = self.mongo.selectUnProcessedFeeds()
        for entry in allFeeds:
            depValues = self.classify(entry['feed'])
            logger.info('control back in processfn')
            # classify() returns 0 when the entry could not be scored.
            if depValues != 0:
                self.mongo.updateDepValues(entry['_id'], depValues)

    def calculateWeight(self, wordsInDepList, sentence, index):
        '''Sum category weights contributed by the words in a +/-2 window
        around ``sentence[index]``.

        ``wordsInDepList`` maps word -> list of {'category', 'value'} dicts
        (as built by classify()).  Returns a dict mapping category to the
        accumulated value; empty when no neighbour is a known word.
        '''
        tempWts = {}
        for offset in (-2, -1, 1, 2):
            pos = index + offset
            # Guard both ends explicitly: a negative pos would silently
            # wrap around to the end of the list.
            if pos < 0 or pos >= len(sentence):
                continue
            # Bug fix: the original discarded str.replace()'s return value
            # (strings are immutable), so punctuated words never matched.
            word = sentence[pos].replace('.', '').replace(',', '')
            if not word.isalnum():
                continue
            for item in wordsInDepList.get(word, ()):
                category = item['category']
                tempWts[category] = tempWts.get(category, 0) + item['value']
        return tempWts

    def addToDepList(self, wordsInDepList, depValues, sentList):
        '''Update (or create) DepWords rows for every word of ``sentList``
        using the weights of its neighbouring words.

        ``depValues`` is kept for interface compatibility; it is not read.
        '''
        for rawSentence in sentList:
            sentence = rawSentence.split()
            for index, word in enumerate(sentence):
                tempWts = self.calculateWeight(wordsInDepList, sentence, index)
                if not tempWts:
                    continue
                normFactor = ceil(max(tempWts.values()))
                if normFactor == 0:
                    # All neighbour weights were zero; nothing to learn and
                    # dividing would raise ZeroDivisionError.
                    continue
                for category, rawValue in tempWts.items():
                    value = rawValue / normFactor
                    if value > 1:
                        # Must hold by construction; raise instead of the
                        # original `assert False` (stripped under -O).
                        raise ValueError(
                            'normalised weight %r > 1 for %r/%r'
                            % (value, word, category))
                    try:
                        depentry = DepWords.objects.get(
                            word=word, category=category)
                        # Exponential moving average with the stored weight.
                        value = 0.16 * value + 0.84 * depentry.value
                        if value > 1:
                            value = value / normFactor
                        # Bug fix: the original computed the new value but
                        # never wrote it back before saving.
                        depentry.value = value
                        depentry.save()
                    except DepWords.DoesNotExist:
                        DepWords(word=word, value=value, samples=-1,
                                 category=category).save()

    def classify(self, feed):
        '''Score a single feed entry against the DepWords table.

        Returns a dict category -> normalised weight, or 0 when no known
        word was found (normalisation factor of zero).
        '''
        # NOTE(review): title and tags are extracted but currently unused —
        # confirm whether they are intended for a future scoring pass.
        title = feed['title']
        try:
            content = feed['summary_detail']['value']
        except KeyError:
            content = feed['summary']
        tags = []
        try:
            for tag in feed['tags']:
                tags.append(tag['term'])
        except KeyError:
            pass
        soup = BeautifulSoup(content)
        text = soup.getText().lower()
        # Pad '.' so it survives as a standalone sentence delimiter.
        text = text.replace('.', ' . ')
        spChars = '~`!@#$%^&*()_-—+=[]{}|:?;"\'\\/>,<“”’‘»…'  #all special char except '.'
        # One C-level translate() pass instead of a per-character loop.
        text = text.translate(str.maketrans('', '', spChars))
        text = self.stemWords(text)
        sentList = text.split('.')
        depValues = {
            "automobile": 0.0, "bussiness": 0.0, "fashion": 0.0,
            "food": 0.0, "health": 0.0, "history": 0.0, "movie": 0.0,
            "music": 0.0, "real-estate": 0.0, "science": 0.0,
            "sports": 0.0, "technology": 0.0, "travel": 0.0,
        }
        wordToAddInDepList = {}
        for sentence in sentList:
            for word in sentence.split():
                for entry in DepWords.objects.filter(word=word):
                    depValues[entry.category] = (
                        depValues.get(entry.category, 0) + entry.value)
                    # Remember the contributing entries so addToDepList can
                    # learn from each word's neighbours.
                    wordToAddInDepList.setdefault(entry.word, []).append(
                        {'category': entry.category, 'value': entry.value})
        # normalize depValues; bail out when nothing matched at all.
        normFactor = ceil(max(depValues.values()))
        if normFactor == 0:
            return 0
        for category in depValues:
            depValues[category] = depValues[category] / normFactor
        self.addToDepList(wordToAddInDepList, depValues, sentList)
        return depValues

    def euclideanDist(self, userVals, postVals):
        '''Euclidean distance between two category->weight dicts sharing the
        same keys, normalised to [0, 1] by the maximum distance sqrt(len).

        Assumes a non-empty ``userVals`` whose keys all exist in ``postVals``.
        '''
        distSquare = sum(
            (userVals[key] - postVals[key]) ** 2 for key in userVals)
        return sqrt(distSquare) / sqrt(len(userVals))

    def calculateUserPostDist(self, user_id):
        '''Compute and store the preference distance between one user's
        profile and every processed feed entry.'''
        user = self.mongo.selectUser(user_id)
        user_dep = user.get('depValues')
        for feed in self.mongo.selectProcessedFeeds(user_id):
            prefValue = self.euclideanDist(user_dep, feed.get('depValues'))
            feed['pref'][str(user_id)] = prefValue
            self.mongo.updateUserPref(feed['_id'], feed['pref'])
class ParsingFuncs:
    ''' Contains all functions to get and parse the feeds '''

    def __init__(self):
        ''' Initialise mongodb connection '''
        self.mongo = Mongo()

    def fetchFeeds(self):
        ''' Fetches all the entries in the table siteInfo and fetches its
        feeds .. Stores to the database '''
        for site in SiteInfo.objects.all():
            # Bug fix: the original built a human-readable string
            # ("etag = ...") and passed it positionally into the etag slot,
            # so conditional GET never worked.  Pass the real values as
            # keyword arguments; feedparser accepts None for either.
            modified = (site.lastModified.utctimetuple()
                        if site.lastModified is not None else None)
            feeds = feedparser.parse(site.feedUrl,
                                     etag=site.etag, modified=modified)
            # find the last modified date of the feed as a whole.
            lastModified = self.findLastModifiedDate(feeds.feed)
            # Bug fix: feedparser exposes the ETag as d.etag, not feed.etag,
            # so the original lookup always failed.
            etag = getattr(feeds, 'etag', None)
            feedsHash = self.md5Feeds(feeds)  # hash of the entire feed
            if site.feedHash == feedsHash:
                continue  # no change in feeds; ignore it
            site.feedHash = feedsHash  # changed -> remember new state
            if etag is None:
                site.lastModified = lastModified
            else:
                site.etag = etag
            site.save()
            for entry in feeds.entries:
                # published_parsed (a struct_time) is not compatible with
                # mongodb; convert to datetime.  Fall back to the feed-level
                # date when an entry carries no published date.
                published = entry.get('published_parsed')
                if published is not None:
                    entry['published_parsed'] = datetime.fromtimestamp(
                        mktime(published))
                else:
                    entry['published_parsed'] = lastModified
                mediaContent = entry.get('media_content')
                try:
                    content = entry['content'][0]['value']
                except (KeyError, IndexError, TypeError):
                    content = None
                entry['image_link'] = self.getImage(
                    media_content=mediaContent,
                    summary=entry['summary'],
                    content=content,
                    link=entry['link'])
                self.mongo.insertFeeds(entry, site.id)

    def allFeeds(self, user_id, lastDate=None):
        '''Select feeds for a user; optionally only those after lastDate.'''
        if lastDate is not None:
            return self.mongo.selectFeeds(user_id=user_id,
                                          dateOfLastItem=lastDate)
        return self.mongo.selectFeeds(user_id=user_id)

    def md5Feeds(self, feed):
        ''' find md5 of feed (change-detection only, not security) '''
        return hashlib.md5(str(feed).encode('utf-8')).hexdigest()

    def selectFeedById(self, id):
        '''Fetch a single feed entry by its mongo id.'''
        return self.mongo.selectFeedById(id)

    def getSiteTitle(self, siteId):
        '''Return the title of the site with the given id, or None.'''
        for site in SiteInfo.objects.filter(id=siteId):
            return site.title
        return None

    def getSummary(self, summary):
        '''Return the first 300 characters of the tag-stripped summary,
        suffixed with an ellipsis.'''
        plain = strip_tags(summary)
        return plain[:300] + "..."

    def getFullPost(self, summaryDetail):
        '''Return the post body with HTML tags stripped.'''
        return strip_tags(summaryDetail)

    def createLastModifiedStr(self, last_modified=None, etag=None):
        '''Legacy helper kept for compatibility: builds the old-style
        "etag = ..." / "modified = ..." string (etag wins when both given).
        fetchFeeds now passes etag/modified to feedparser directly.'''
        modiStr = None
        if etag is not None:
            modiStr = "etag = " + str(etag)
        if last_modified is not None:
            modiStr = "modified = " + str(last_modified.utctimetuple())
        return modiStr

    def findLastModifiedDate(self, feed):
        '''Best-effort last-modified datetime for a parsed feed: try
        updated_parsed, date_parsed, published_parsed in order, falling
        back to the current local time.'''
        for attr in ('updated_parsed', 'date_parsed', 'published_parsed'):
            try:
                return datetime.fromtimestamp(mktime(getattr(feed, attr)))
            except (AttributeError, TypeError, ValueError, OverflowError):
                continue  # attribute missing or not a valid time tuple
        structTime = time.localtime()
        return datetime(*structTime[:6])

    def getFullPostURLOpen(self, link, summary):
        '''Fetch the original page and return the <div> containing the
        start of the summary, or None when the page cannot be fetched or
        no match is found.'''
        http = urllib3.PoolManager()
        try:
            page = http.request('GET', link).data
        except Exception:
            # Best-effort fetch: any network/HTTP failure means no page.
            page = None
        if page is None:
            return None
        soup = BeautifulSoup(page)
        # modify this like check match for entire summary; if not found,
        # retry with shorter substrings.
        snippet = summary[:25]
        # re.escape: the snippet is plain text, not a regex pattern.
        element = soup.find(text=re.compile(re.escape(snippet)))
        if element is None:
            # Bug fix: the original called .findParent on None and crashed.
            return None
        return element.findParent('div')

    def findImgsrcFromHtml(self, content):
        '''Return the src of the first non-tracking-pixel <img> in the
        HTML fragment, or None.'''
        soup = BeautifulSoup(content)
        for img in soup.findAll('img'):
            # Skip 1x1 tracking pixels; .get avoids the KeyErrors the
            # original raised on images without height/width/src.
            if img.get('height') == '1' or img.get('width') == '1':
                continue
            src = img.get('src')
            if src is not None:
                return src
        return None

    def getImage(self, media_content=None, summary=None, content=None,
                 link=None):
        '''Pick an image for the entry, trying media_content, then the
        summary, then the content, then the fetched original page.'''
        if media_content is not None:
            return media_content[0]['url']
        # Bug fix: the original returned the summary result even when it
        # was None, never falling through to content/link as the comments
        # described.
        if summary is not None:
            img = self.findImgsrcFromHtml(summary)
            if img is not None:
                return img
        if content is not None:
            img = self.findImgsrcFromHtml(content)
            if img is not None:
                return img
        if link is not None:
            post = self.getFullPostURLOpen(link, summary)
            if post is not None:
                return self.findImgsrcFromHtml(post)
        return None