Example no. 1
0
class PredictorFuncs:
    '''
    Classification / recommendation helpers built on Mongo-stored feeds.

    Maintains per-word category weights (DepWords rows) and per-feed
    category scores (depValues), and computes user-vs-post preference
    distances.
    '''

    def __init__(self):
        '''
        Initialise mongodb connection
        '''
        self.mongo = Mongo()

    def removeStopWords(self, splitText):
        '''
        Return the tokens of *splitText* with English stopwords and
        feed-specific noise tokens ("read more" artefacts) removed.
        '''
        modified_stopwords = stopwords.words('english')
        modified_stopwords.extend(('[...]','.read','read','more…', '…','more...','more.read'))
        # Use a set so each membership test is O(1) instead of O(n).
        blocked = set(modified_stopwords)
        return [w for w in splitText if w not in blocked]

    def stemWords(self, sent, rmStopWords=True):
        '''
        Lemmatise every word of *sent* (a string) as a verb and return the
        re-joined string.  Stopwords are removed first unless
        *rmStopWords* is False.
        '''
        words = sent.split()
        if rmStopWords:
            words = self.removeStopWords(words)
        # Build the lemmatizer once instead of once per word.
        lemmatizer = WordNetLemmatizer()
        return " ".join(lemmatizer.lemmatize(word, 'v') for word in words)

    def processAllExistingFeeds(self):
        '''
        Classify every not-yet-processed feed and persist its category
        values.  Feeds that could not be classified (classify() returned
        0) are left untouched.
        '''
        for entry in self.mongo.selectUnProcessedFeeds():
            depValues = self.classify(entry['feed'])
            logger.info('control back in processfn')
            if depValues != 0:
                self.mongo.updateDepValues(entry['_id'], depValues)

    def calculateWeight(self, wordsInDepList, sentence, index):
        '''
        Accumulate the category weights contributed by the two words on
        either side of ``sentence[index]``.

        :param wordsInDepList: dict mapping word -> list of
            ``{'category': ..., 'value': ...}`` entries.
        :param sentence: list of tokens.
        :param index: position of the word whose context is examined.
        :returns: dict mapping category -> summed weight (may be empty).
        '''
        tempWts = {}
        for offset in (-2, -1, 1, 2):
            pos = index + offset
            if pos < 0 or pos >= len(sentence):
                continue
            # BUGFIX: str.replace returns a new string -- the original code
            # discarded the cleaned value, so tokens such as "word." never
            # passed isalnum() and were silently skipped.
            token = sentence[pos].replace('.', '').replace(',', '')
            if not token.isalnum():
                continue
            for item in wordsInDepList.get(token, ()):
                tempWts[item['category']] = (
                    tempWts.get(item['category'], 0) + item['value'])
        return tempWts

    def addToDepList(self, wordsInDepList, depValues, sentList):
        '''
        Update (or create) DepWords rows from the context weights of every
        word in *sentList*.  New values are blended with the stored ones
        using an exponential moving average (alpha = 0.16).

        *depValues* is accepted for interface compatibility but unused.
        '''
        for sentence in sentList:
            tokens = sentence.split()
            for index, word in enumerate(tokens):
                tempWts = self.calculateWeight(wordsInDepList, tokens, index)
                if not tempWts:
                    continue
                normFactor = ceil(max(tempWts.values()))
                # BUGFIX: guard against division by zero when every
                # neighbouring weight is 0.
                if normFactor == 0:
                    continue
                for category, value in tempWts.items():
                    value = value / normFactor
                    if value > 1:
                        # ceil(max) >= every weight, so this should never
                        # fire; raise (not assert -- stripped under -O).
                        raise ValueError(
                            'normalised weight > 1 for %r' % word)
                    #logger.info(word + ' ' + category + ' ' + str(value))
                    try:
                        depentry = DepWords.objects.get(
                            word=word, category=category)
                        value = 0.16 * value + 0.84 * depentry.value
                        if value > 1:
                            value = value / normFactor
                        # BUGFIX: the blended value was computed but never
                        # written back before save() in the original code.
                        depentry.value = value
                        depentry.save()
                    except DepWords.DoesNotExist:
                        DepWords(word=word, value=value, samples=-1,
                                 category=category).save()

    def classify(self, feed):
        '''
        Compute normalised per-category dependency values for *feed*.

        :returns: dict of category -> value, or 0 when no known word was
            found (normalisation factor 0).
        '''
        title = feed['title']  # kept for parity; not used below
        try:
            content = feed['summary_detail']['value']
        except KeyError:  # was a bare except; only KeyError is expected here
            content = feed['summary']
        tags = []  # collected for parity with the original; not used below
        try:
            for tag in feed['tags']:
                tags.append(tag['term'])
        except KeyError:
            pass
        text = BeautifulSoup(content).getText().lower()
        # Isolate full stops so they survive as sentence separators, then
        # drop every other special character.
        text = text.replace('.',' . ')
        spChars = '~`!@#$%^&*()_-—+=[]{}|:?;"\'\\/>,<“”’‘»…' #all special char except '.'
        text = ''.join(c for c in text if c not in spChars)
        text = self.stemWords(text)
        sentList = text.split('.')
        depValues = {
            "automobile": 0.0,
            "bussiness": 0.0,
            "fashion": 0.0,
            "food": 0.0,
            "health": 0.0,
            "history": 0.0,
            "movie": 0.0,
            "music": 0.0,
            "real-estate": 0.0,
            "science": 0.0,
            "sports": 0.0,
            "technology": 0.0,
            "travel": 0.0
        }
        wordToAddInDepList = {}
        for sentence in sentList:
            for word in sentence.split():
                for entry in DepWords.objects.filter(word=word):
                    depValues[entry.category] = depValues.get(
                        entry.category, 0) + entry.value
                    # Remember the contributing rows so the dependency
                    # list can be refreshed by addToDepList afterwards.
                    wordToAddInDepList.setdefault(entry.word, []).append(
                        {'category': entry.category, 'value': entry.value})
        # normalize depValues
        normFactor = ceil(max(depValues.values()))
        if normFactor == 0:
            return 0
        for category in depValues:
            depValues[category] = depValues[category] / normFactor
        self.addToDepList(wordToAddInDepList, depValues, sentList)
        return depValues

    def euclideanDist(self, userVals, postVals):
        '''
        Euclidean distance between two category-value dicts, normalised by
        sqrt(number of categories) so the result lies in [0, 1].
        Both dicts must share the keys of *userVals*.
        '''
        distSquare = sum((userVals[cat] - postVals[cat]) ** 2
                         for cat in userVals)
        return sqrt(distSquare) / sqrt(len(userVals))

    def calculateUserPostDist(self, user_id):
        '''
        Recompute the preference value of every processed feed for
        *user_id* and persist it back to mongo.
        '''
        user = self.mongo.selectUser(user_id)
        user_dep = user.get('depValues')
        for feed in self.mongo.selectProcessedFeeds(user_id):
            prefValue = self.euclideanDist(user_dep, feed.get('depValues'))
            # setdefault so feeds without a 'pref' dict yet don't raise
            # KeyError (the original indexed feed['pref'] directly).
            feed.setdefault('pref', {})[str(user_id)] = prefValue
            self.mongo.updateUserPref(feed['_id'], feed['pref'])
Example no. 2
0
 def __init__(self):
     '''
     Set up the MongoDB connection used by this helper.
     '''
     self.mongo = Mongo()
Example no. 3
0
class ParsingFuncs:

    '''
    Contains all functions to get and parse the feeds
    '''

    def __init__(self):
        '''
        Initialise mongodb connection
        '''
        self.mongo = Mongo()

    def fetchFeeds(self):
        '''
        Fetch the feed of every site in SiteInfo and store new entries in
        the database.

        The stored feed hash is used to skip unchanged feeds; the stored
        etag / last-modified data is used for conditional requests.
        '''
        for site in SiteInfo.objects.all():
            modifiedStr = self.createLastModifiedStr(site.lastModified, site.etag)
            # NOTE(review): feedparser.parse's second positional argument
            # is the etag; passing an "etag = ..."/"modified = ..." string
            # looks suspicious -- confirm against the feedparser API.
            if modifiedStr is not None:
                feeds = feedparser.parse(site.feedUrl, modifiedStr)
            else:
                feeds = feedparser.parse(site.feedUrl)
            # The last modified date may live in feed.updated,
            # feed.date or feed.published (parsed variants).
            lastModified = self.findLastModifiedDate(feeds.feed)
            etag = getattr(feeds.feed, 'etag', None)

            feedsHash = self.md5Feeds(feeds)  # hash of the entire feed payload
            if site.feedHash == feedsHash:
                continue  # no change in feeds -> nothing to do
            site.feedHash = feedsHash  # feed changed: remember the new hash
            if etag is None:
                site.lastModified = lastModified
            else:
                site.etag = etag
            site.save()
            for entry in feeds.entries:
                # published_parsed (a struct_time) is not compatible with
                # mongodb, so convert it to a datetime first.
                entry['published_parsed'] = datetime.fromtimestamp(
                    mktime(entry.published_parsed))
                mediaContent = entry.get('media_content')  # None when absent
                try:
                    content = entry['content'][0]['value']
                except (KeyError, IndexError):  # was a bare except
                    content = None
                entry['image_link'] = self.getImage(media_content=mediaContent,
                                                    summary=entry['summary'],
                                                    content=content,
                                                    link=entry['link'])
                self.mongo.insertFeeds(entry, site.id)

    def allFeeds(self, user_id, lastDate=None):
        '''
        Return feeds for *user_id*; only those after *lastDate* when given.
        '''
        if lastDate is not None:
            return self.mongo.selectFeeds(user_id=user_id, dateOfLastItem=lastDate)
        return self.mongo.selectFeeds(user_id=user_id)

    def md5Feeds(self, feed):
        '''
        Return the hex md5 digest of str(*feed*).
        '''
        return hashlib.md5(str(feed).encode('utf-8')).hexdigest()

    def selectFeedById(self, id):
        '''
        Fetch a single feed document by its mongo id.
        '''
        return self.mongo.selectFeedById(id)

    def getSiteTitle(self, siteId):
        '''
        Return the title of the site with *siteId*, or None when unknown.
        '''
        for site in SiteInfo.objects.filter(id=siteId):
            return site.title

    def getSummary(self, summary):
        '''
        Return the first 300 characters of *summary* (HTML stripped) with
        an ellipsis appended.
        '''
        text = strip_tags(summary)
        return text[:300] + "..."

    def getFullPost(self, summaryDetail):
        '''
        Return *summaryDetail* with all HTML tags stripped.
        '''
        return strip_tags(summaryDetail)

    def createLastModifiedStr(self, last_modified=None, etag=None):
        '''
        Build the conditional-request string handed to feedparser.parse.

        *last_modified* wins over *etag* when both are given; returns None
        when neither is set.
        '''
        modiStr = None
        if etag is not None:
            modiStr = "etag = " + str(etag)
        if last_modified is not None:
            modiStr = "modified = " + str(last_modified.utctimetuple())
        return modiStr

    def findLastModifiedDate(self, feed):
        '''
        Return the feed's last-modified datetime, trying updated_parsed,
        date_parsed and published_parsed in turn, falling back to now.
        '''
        for attr in ('updated_parsed', 'date_parsed', 'published_parsed'):
            try:
                return datetime.fromtimestamp(mktime(getattr(feed, attr)))
            except Exception:  # field absent or unparsable; try the next one
                continue
        # none of the parsed date fields is usable -> use the current time
        structTime = time.localtime()
        return datetime(*structTime[:6])

    def getFullPostURLOpen(self, link, summary):
        '''
        Download *link* and return the <div> containing the start of
        *summary*, or None on any failure.
        '''
        http = urllib3.PoolManager()
        #req = Request(link, headers={'User-Agent': "ireadr"})
        try:
            page = http.request('GET', link).data
            #page = urlopen(req)
        except Exception:  # network/HTTP errors: treat page as unavailable
            page = None
        if page is None:
            return None
        soup = BeautifulSoup(page)
        # Match only the first 25 chars of the summary.
        # modify this like check match for entire summary.
        #if not found find a substring of length 50 and check again
        # if again not found then reduce the length and try again
        # BUGFIX: escape the snippet -- it is literal text, not a regex.
        element = soup.find(text=re.compile(re.escape(summary[:25])))
        if element is None:
            # BUGFIX: the original called findParent on None and raised
            # AttributeError when the snippet was not found.
            return None
        return element.findParent('div')

    def findImgsrcFromHtml(self, content):
        '''
        Return the src of the first <img> in *content* that is not a 1x1
        tracking pixel, or None when there is no usable image.
        '''
        soup = BeautifulSoup(content)
        for link in soup.findAll('img'):
            src = link.get('src')
            if src is None:
                # BUGFIX: an <img> without src raised KeyError before.
                continue
            # skip 1x1 tracking pixels (missing dimensions are accepted)
            if link.get('height') == '1' or link.get('width') == '1':
                continue
            return src
        return None

    def getImage(self, media_content=None, summary=None, content=None, link=None):
        '''
        Find a representative image for an entry.

        Tries, in order: the feed's media_content, the summary HTML, the
        content HTML, and finally the full page behind *link*.  Returns
        the image URL or None.
        '''
        if media_content is not None:
            return media_content[0]['url']
        # if media_content is None
        if summary is not None:
            img = self.findImgsrcFromHtml(summary)
            # BUGFIX: the original returned here even when no image was
            # found, so the content/link fallbacks (described by its own
            # comments) were unreachable.
            if img is not None:
                return img
        # if no matching image is found in summary
        if content is not None:
            img = self.findImgsrcFromHtml(content)
            if img is not None:
                return img
        # if image is not found in content then fetch the original page
        # and extract the image
        if link is not None:
            post = self.getFullPostURLOpen(link, summary)
            if post is not None:
                return self.findImgsrcFromHtml(post)
        return None