示例#1
0
def _parse_last_post_datetime(time_span):
    """Turn the text around a thread row's 'time' span into a datetime.

    The text of the span's parent <div> is cut down to its date/time part,
    normalised through the module-level ``uniform_date`` helper and parsed
    with the '%m-%d-%Y%I:%M %p' format.

    Raises IndexError when the text holds no '..:.. .M' time, and ValueError
    when the normalised text does not match the expected format.
    """
    raw_text = time_span.findParent('div').text
    stripped_last_post_time = re.findall('(.*?..:.. .M)', raw_text)[0]
    last_post_time = uniform_date(stripped_last_post_time)
    return datetime.datetime.strptime(last_post_time, '%m-%d-%Y%I:%M %p')


def _is_older_than_latest_kong(last_post_datetime, is_sticky,
                               section_or_subsection):
    """Tell whether a non-sticky thread predates the newest stored thread.

    Returns False when the section has no stored thread at all. The checks
    run in the same order as the original inline condition (thread count,
    latest 'last_modified', then stickiness).
    """
    if not section_or_subsection.threads.count():
        return False
    latest_thread = section_or_subsection.threads.filter(
        last_modified__isnull=False).latest('last_modified')
    return (last_post_datetime < latest_thread.last_modified
            and not is_sticky)


def get_kongs_urls(soup, section_or_subsection):
    """Collect the links of the threads flagged with the staff-tracker image.

    Behaviour depends on the module-level SCRIPT_MODE:

    * 'CREATE': the link of every flagged thread on the page is returned.
    * 'UPDATE': a link is returned only when the thread is not stored exactly
      once as a KongThread, or when the page advertises more staff posts
      than are stored for it. The marker string 'STOP' is appended when the
      page content is older than what is already stored.
    * any other value: an empty list is returned.

    Args:
        soup: parsed forum page exposing ``fetch`` (presumably a
            BeautifulSoup 3 document -- confirm against the caller).
        section_or_subsection: object exposing a ``threads`` manager whose
            items carry a ``last_modified`` attribute.

    Returns:
        A list of href strings, possibly containing 'STOP' entries.
    """
    kongs_list = []
    kong_img_attrs = {'src': 'images/misc/stafftracker.png'}

    if SCRIPT_MODE == 'UPDATE':
        # Compiled once instead of once per flagged thread.
        thread_title_pattern = re.compile('thread_title_.*?')

        for kong_img in soup.fetch('img', kong_img_attrs):
            thread_row = kong_img.findParent('tr')

            # Get latest post time of this kong_thread
            last_post_datetime = _parse_last_post_datetime(
                thread_row.findChild('span', {'class': 'time'}))

            # Check if the kong_thread is a Sticky
            is_sticky = bool(
                kong_img.findParent('div').findChildren(
                    'img', alt='Sticky Thread'))

            kong_thread_title = thread_row.fetch(
                'a', {'id': thread_title_pattern})[0].text
            thread_url = kong_img.findParent('a')['href']
            kong_thread_id = int(
                re.findall('.*?p=(.*?)#post', thread_url)[0])

            lookup = {
                'thread_id': kong_thread_id,
                'title__contains': kong_thread_title,
            }
            if KongThread.objects.filter(**lookup).count() != 1:
                # Unknown (or ambiguous) thread: always worth fetching.
                kongs_list.append(thread_url)
                continue

            # Fetch the stored thread once and reuse it below.
            kong_thread = KongThread.objects.get(**lookup)
            kong_posts_number = int(
                re.findall('(.*?) Staff Post...', kong_img['alt'])[0])

            if kong_posts_number > kong_thread.kongpost_set.count():
                # The page advertises more staff posts than we have stored.
                kongs_list.append(thread_url)
                continue

            # Less or equal number of kong posts in this thread: ignore it.
            try:
                print(get_decoded_text(
                    u'Nothing new on thread %s !' %
                    get_decoded_text(kong_thread_title)))
            except Exception:
                # Printing is best effort; 'Exception' (instead of a bare
                # except) lets KeyboardInterrupt/SystemExit propagate.
                print("******* Uh oh, can't write the thread title down :(")

            if _is_older_than_latest_kong(last_post_datetime, is_sticky,
                                          section_or_subsection):
                kongs_list.append('STOP')
                break
        else:
            # for/else: only reached when the loop above did not 'break'.
            # Get latest post time of first thread in this section page
            # if there is any thread on this page...
            page_threads = soup.fetch(
                'td', {'id': re.compile("td_threadtitle_.*?")})
            if page_threads:
                first_row = page_threads[0].findParent('tr')
                time_span = first_row.findChild('span', {'class': 'time'})
                if time_span is not None:
                    last_post_datetime = _parse_last_post_datetime(time_span)

                    # Check if the first thread is a Sticky
                    is_sticky = bool(
                        first_row.findChildren('img', alt='Sticky Thread'))

                    # The two checks below are independent, so 'STOP' can be
                    # appended twice (kept as in the original behaviour).

                    # If the last post of the first thread is older than our
                    # last kong for this section/subsection, just stop
                    if _is_older_than_latest_kong(last_post_datetime,
                                                  is_sticky,
                                                  section_or_subsection):
                        kongs_list.append('STOP')

                    # If the last post of the first thread is older than one
                    # week, just stop
                    one_week_ago = (datetime.datetime.now() -
                                    relativedelta(weeks=1))
                    if last_post_datetime < one_week_ago:
                        kongs_list.append('STOP')

    elif SCRIPT_MODE == 'CREATE':
        for kong_img in soup.fetch('img', kong_img_attrs):
            kongs_list.append(kong_img.findParent('a')['href'])

    return kongs_list
示例#2
0
def _parse_last_post_datetime(time_span):
    """Turn the text around a thread row's 'time' span into a datetime.

    The text of the span's parent <div> is cut down to its date/time part,
    normalised through the module-level ``uniform_date`` helper and parsed
    with the '%m-%d-%Y%I:%M %p' format.

    Raises IndexError when the text holds no '..:.. .M' time, and ValueError
    when the normalised text does not match the expected format.
    """
    raw_text = time_span.findParent('div').text
    stripped_last_post_time = re.findall('(.*?..:.. .M)', raw_text)[0]
    last_post_time = uniform_date(stripped_last_post_time)
    return datetime.datetime.strptime(last_post_time, '%m-%d-%Y%I:%M %p')


def _is_older_than_latest_kong(last_post_datetime, is_sticky,
                               section_or_subsection):
    """Tell whether a non-sticky thread predates the newest stored thread.

    Returns False when the section has no stored thread at all. The checks
    run in the same order as the original inline condition (thread count,
    latest 'last_modified', then stickiness).
    """
    if not section_or_subsection.threads.count():
        return False
    latest_thread = section_or_subsection.threads.filter(
        last_modified__isnull=False).latest('last_modified')
    return (last_post_datetime < latest_thread.last_modified
            and not is_sticky)


def get_kongs_urls(soup, section_or_subsection):
    """Collect the links of the threads flagged with the staff-tracker image.

    Behaviour depends on the module-level SCRIPT_MODE:

    * 'CREATE': the link of every flagged thread on the page is returned.
    * 'UPDATE': a link is returned only when the thread is not stored exactly
      once as a KongThread, or when the page advertises more staff posts
      than are stored for it. The marker string 'STOP' is appended when the
      page content is older than what is already stored.
    * any other value: an empty list is returned.

    Args:
        soup: parsed forum page exposing ``fetch`` (presumably a
            BeautifulSoup 3 document -- confirm against the caller).
        section_or_subsection: object exposing a ``threads`` manager whose
            items carry a ``last_modified`` attribute.

    Returns:
        A list of href strings, possibly containing 'STOP' entries.
    """
    kongs_list = []
    kong_img_attrs = {'src': 'images/misc/stafftracker.png'}

    if SCRIPT_MODE == 'UPDATE':
        # Compiled once instead of once per flagged thread.
        thread_title_pattern = re.compile('thread_title_.*?')

        for kong_img in soup.fetch('img', kong_img_attrs):
            thread_row = kong_img.findParent('tr')

            # Get latest post time of this kong_thread
            last_post_datetime = _parse_last_post_datetime(
                thread_row.findChild('span', {'class': 'time'}))

            # Check if the kong_thread is a Sticky
            is_sticky = bool(
                kong_img.findParent('div').findChildren(
                    'img', alt='Sticky Thread'))

            kong_thread_title = thread_row.fetch(
                'a', {'id': thread_title_pattern})[0].text
            thread_url = kong_img.findParent('a')['href']
            kong_thread_id = int(
                re.findall('.*?p=(.*?)#post', thread_url)[0])

            lookup = {
                'thread_id': kong_thread_id,
                'title__contains': kong_thread_title,
            }
            if KongThread.objects.filter(**lookup).count() != 1:
                # Unknown (or ambiguous) thread: always worth fetching.
                kongs_list.append(thread_url)
                continue

            # Fetch the stored thread once and reuse it below.
            kong_thread = KongThread.objects.get(**lookup)
            kong_posts_number = int(
                re.findall('(.*?) Staff Post...', kong_img['alt'])[0])

            if kong_posts_number > kong_thread.kongpost_set.count():
                # The page advertises more staff posts than we have stored.
                kongs_list.append(thread_url)
                continue

            # Less or equal number of kong posts in this thread: ignore it.
            try:
                print(get_decoded_text(
                    u'Nothing new on thread %s !' %
                    get_decoded_text(kong_thread_title)))
            except Exception:
                # Printing is best effort; 'Exception' (instead of a bare
                # except) lets KeyboardInterrupt/SystemExit propagate.
                print("******* Uh oh, can't write the thread title down :(")

            if _is_older_than_latest_kong(last_post_datetime, is_sticky,
                                          section_or_subsection):
                kongs_list.append('STOP')
                break
        else:
            # for/else: only reached when the loop above did not 'break'.
            # Get latest post time of first thread in this section page
            # if there is any thread on this page...
            page_threads = soup.fetch(
                'td', {'id': re.compile("td_threadtitle_.*?")})
            if page_threads:
                first_row = page_threads[0].findParent('tr')
                time_span = first_row.findChild('span', {'class': 'time'})
                if time_span is not None:
                    last_post_datetime = _parse_last_post_datetime(time_span)

                    # Check if the first thread is a Sticky
                    is_sticky = bool(
                        first_row.findChildren('img', alt='Sticky Thread'))

                    # The two checks below are independent, so 'STOP' can be
                    # appended twice (kept as in the original behaviour).

                    # If the last post of the first thread is older than our
                    # last kong for this section/subsection, just stop
                    if _is_older_than_latest_kong(last_post_datetime,
                                                  is_sticky,
                                                  section_or_subsection):
                        kongs_list.append('STOP')

                    # If the last post of the first thread is older than one
                    # week, just stop
                    one_week_ago = (datetime.datetime.now() -
                                    relativedelta(weeks=1))
                    if last_post_datetime < one_week_ago:
                        kongs_list.append('STOP')

    elif SCRIPT_MODE == 'CREATE':
        for kong_img in soup.fetch('img', kong_img_attrs):
            kongs_list.append(kong_img.findParent('a')['href'])

    return kongs_list
示例#3
0
def _read_kong_page(url):
    """Download ``url`` with a browser-like User-Agent and return the body.

    The response object is always closed, even when reading fails.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla 5.0')
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        response.close()


def fetch_kong_posts(kong_post_url, forum_section):
    """Store the chain of kong (staff) posts that starts at ``kong_post_url``.

    The page behind the URL is downloaded, the matching KongThread is
    fetched or created, and then each staff post is stored as a KongPost
    (unless an identical one already exists) while following the
    "next staff post" links found in the page.

    Args:
        kong_post_url: URL of the first staff post; it has to contain
            '.php?p=<id>#post<post id>', otherwise nothing is done.
        forum_section: model instance the thread is attached to through
            ``object_id`` / ``content_type``.

    Returns:
        None. The work is done through side effects (ORM writes, prints).

    Note:
        Downloads are retried without limit until they succeed, as in the
        original code; only the exception filter was narrowed.
    """
    print('KONG_POST_URL : %s' % kong_post_url)

    # if id doesnt exist, RETURN. Checked before downloading anything, and
    # without indexing an empty match list (which used to raise IndexError).
    thread_id_matches = re.findall(r'.php\?p=(.*?)#post', kong_post_url)
    if not thread_id_matches or not thread_id_matches[0]:
        return
    kong_thread_id = thread_id_matches[0]

    # Launch the request, retrying until a non-empty body is received.
    kong_post_string = ''
    while not kong_post_string:
        try:
            kong_post_string = _read_kong_page(kong_post_url)
        except Exception:
            # 'Exception' rather than a bare except, so that Ctrl-C
            # (KeyboardInterrupt) can still stop the retry loop.
            print("******* Error while fetching kong_post_string! Let's get it again ! ********")

    kong_thread_url = kong_post_url.split('#post')[0]

    # Build a beautiful soup with the HTML Source
    soup = BeautifulSoup.BeautifulSoup(kong_post_string)

    # Create or retrieve kong thread
    kong_thread, created = KongThread.objects.get_or_create(
        thread_id=int(kong_thread_id),
        object_id=forum_section.id,
        content_type=ContentType.objects.get_for_model(forum_section),
        link=kong_thread_url,
        title=soup.fetch('td', 'navbar')[0].text)

    if created:
        try:
            print('New Kong Thread created ! %s - %s' % (kong_thread.title,
                                                         kong_thread.id))
        except Exception:
            print('New Kong Thread created ! ID : %s but couldnt write its title :(' % kong_thread.id)

    # Get the kong_post !
    while True:
        # if id doesnt exist, BREAK (also when the URL has no '#post' part,
        # which used to raise IndexError).
        url_parts = kong_post_url.split('#post')
        kong_post_id = url_parts[1] if len(url_parts) > 1 else ''
        if not kong_post_id:
            break

        kong_post_tables = soup.fetch('table', id='post%s' % kong_post_id)
        if not kong_post_tables:
            break
        kong_post_table = kong_post_tables[0]

        post_date_td = kong_post_table.fetch('td', {'class': 'thead'})[0]
        post_date = re.findall('date(.*?, ..:.. .M)', post_date_td.text)[0]

        post_div = kong_post_table.fetch(
            'div', {'id': 'postmenu_%s' % kong_post_id})[0]
        post_datetime = datetime.datetime.strptime(uniform_date(post_date),
                                                   '%m-%d-%Y, %I:%M %p')

        post_author = post_div.fetch('a')[0].text
        if 'formatting' in post_author:
            # Keep only the name enclosed by the 'Username formatting' marks.
            post_author = re.findall(
                'Username formatting(.*?)/Username formatting',
                post_author)[0]

        # Fields identifying a kong post; shared by the lookup and the create.
        post_fields = {
            'author': post_author,
            'forum_id': kong_post_id,
            'date': post_datetime,
            'kong_thread': kong_thread,
            'link': kong_post_url,
        }
        already_created = KongPost.objects.filter(**post_fields).count() > 0

        if not already_created:
            kong_post = KongPost.objects.create(
                message=kong_post_table.fetch(
                    'div', id='post_message_%s' % kong_post_id)[0],
                **post_fields)
            try:
                print('New KongPost created ! topic : %s %s - id : %s' % (
                    get_decoded_text(kong_thread.title),
                    post_datetime.strftime('%d/%m/%Y %H:%M'), kong_post.id))
            except Exception:
                print("New KongPost created but can't write its title :(")
            kong_thread.last_modified = post_datetime
            kong_thread.save()
        else:
            try:
                print('KongPost already exist on topic %s !' % get_decoded_text(
                    kong_thread.title))
            except Exception:
                print("KongPost already exist on topic, but can't write topic title :(")

        # If there is a next kong post, here we go again ! or not :(
        next_kong_post_imgs = kong_post_table.fetch(
            'img',
            alt='Click here to go to the next staff post in this thread.')
        if not next_kong_post_imgs:
            break

        next_kong_post_url = next_kong_post_imgs[0].findParent('a')['href']
        print('NEXT KONG POST URL : %s' % next_kong_post_url)

        # Check for kong post loops
        if next_kong_post_url in kong_post_url:
            return

        if 'showthread' in next_kong_post_url:
            # The link points to another page: rebuild the absolute URL and
            # download that page, retrying until it can be parsed.
            kong_post_url = kong_post_url.split(
                'showthread')[0] + next_kong_post_url
            soup = ''
            while not soup:
                try:
                    soup = BeautifulSoup.BeautifulSoup(
                        _read_kong_page(kong_post_url))
                except Exception:
                    print("*** Erreur en demandant la soupe ! ***")
        else:
            # Same page: only the fragment changes.
            kong_post_url = kong_post_url.split(
                '#')[0] + '#' + next_kong_post_url.split('#')[1]
        print("NEW KONG POST URL : %s" % kong_post_url)
示例#4
0
def _read_kong_page(url):
    """Download ``url`` with a browser-like User-Agent and return the body.

    The response object is always closed, even when reading fails.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla 5.0')
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        response.close()


def fetch_kong_posts(kong_post_url, forum_section):
    """Store the chain of kong (staff) posts that starts at ``kong_post_url``.

    The page behind the URL is downloaded, the matching KongThread is
    fetched or created, and then each staff post is stored as a KongPost
    (unless an identical one already exists) while following the
    "next staff post" links found in the page.

    Args:
        kong_post_url: URL of the first staff post; it has to contain
            '.php?p=<id>#post<post id>', otherwise nothing is done.
        forum_section: model instance the thread is attached to through
            ``object_id`` / ``content_type``.

    Returns:
        None. The work is done through side effects (ORM writes, prints).

    Note:
        Downloads are retried without limit until they succeed, as in the
        original code; only the exception filter was narrowed.
    """
    print('KONG_POST_URL : %s' % kong_post_url)

    # if id doesnt exist, RETURN. Checked before downloading anything, and
    # without indexing an empty match list (which used to raise IndexError).
    thread_id_matches = re.findall(r'.php\?p=(.*?)#post', kong_post_url)
    if not thread_id_matches or not thread_id_matches[0]:
        return
    kong_thread_id = thread_id_matches[0]

    # Launch the request, retrying until a non-empty body is received.
    kong_post_string = ''
    while not kong_post_string:
        try:
            kong_post_string = _read_kong_page(kong_post_url)
        except Exception:
            # 'Exception' rather than a bare except, so that Ctrl-C
            # (KeyboardInterrupt) can still stop the retry loop.
            print("******* Error while fetching kong_post_string! Let's get it again ! ********")

    kong_thread_url = kong_post_url.split('#post')[0]

    # Build a beautiful soup with the HTML Source
    soup = BeautifulSoup.BeautifulSoup(kong_post_string)

    # Create or retrieve kong thread
    kong_thread, created = KongThread.objects.get_or_create(
        thread_id=int(kong_thread_id),
        object_id=forum_section.id,
        content_type=ContentType.objects.get_for_model(forum_section),
        link=kong_thread_url,
        title=soup.fetch('td', 'navbar')[0].text)

    if created:
        try:
            print('New Kong Thread created ! %s - %s' % (kong_thread.title,
                                                         kong_thread.id))
        except Exception:
            print('New Kong Thread created ! ID : %s but couldnt write its title :(' % kong_thread.id)

    # Get the kong_post !
    while True:
        # if id doesnt exist, BREAK (also when the URL has no '#post' part,
        # which used to raise IndexError).
        url_parts = kong_post_url.split('#post')
        kong_post_id = url_parts[1] if len(url_parts) > 1 else ''
        if not kong_post_id:
            break

        kong_post_tables = soup.fetch('table', id='post%s' % kong_post_id)
        if not kong_post_tables:
            break
        kong_post_table = kong_post_tables[0]

        post_date_td = kong_post_table.fetch('td', {'class': 'thead'})[0]
        post_date = re.findall('date(.*?, ..:.. .M)', post_date_td.text)[0]

        post_div = kong_post_table.fetch(
            'div', {'id': 'postmenu_%s' % kong_post_id})[0]
        post_datetime = datetime.datetime.strptime(uniform_date(post_date),
                                                   '%m-%d-%Y, %I:%M %p')

        post_author = post_div.fetch('a')[0].text
        if 'formatting' in post_author:
            # Keep only the name enclosed by the 'Username formatting' marks.
            post_author = re.findall(
                'Username formatting(.*?)/Username formatting',
                post_author)[0]

        # Fields identifying a kong post; shared by the lookup and the create.
        post_fields = {
            'author': post_author,
            'forum_id': kong_post_id,
            'date': post_datetime,
            'kong_thread': kong_thread,
            'link': kong_post_url,
        }
        already_created = KongPost.objects.filter(**post_fields).count() > 0

        if not already_created:
            kong_post = KongPost.objects.create(
                message=kong_post_table.fetch(
                    'div', id='post_message_%s' % kong_post_id)[0],
                **post_fields)
            try:
                print('New KongPost created ! topic : %s %s - id : %s' % (
                    get_decoded_text(kong_thread.title),
                    post_datetime.strftime('%d/%m/%Y %H:%M'), kong_post.id))
            except Exception:
                print("New KongPost created but can't write its title :(")
            kong_thread.last_modified = post_datetime
            kong_thread.save()
        else:
            try:
                print('KongPost already exist on topic %s !' % get_decoded_text(
                    kong_thread.title))
            except Exception:
                print("KongPost already exist on topic, but can't write topic title :(")

        # If there is a next kong post, here we go again ! or not :(
        next_kong_post_imgs = kong_post_table.fetch(
            'img',
            alt='Click here to go to the next staff post in this thread.')
        if not next_kong_post_imgs:
            break

        next_kong_post_url = next_kong_post_imgs[0].findParent('a')['href']
        print('NEXT KONG POST URL : %s' % next_kong_post_url)

        # Check for kong post loops
        if next_kong_post_url in kong_post_url:
            return

        if 'showthread' in next_kong_post_url:
            # The link points to another page: rebuild the absolute URL and
            # download that page, retrying until it can be parsed.
            kong_post_url = kong_post_url.split(
                'showthread')[0] + next_kong_post_url
            soup = ''
            while not soup:
                try:
                    soup = BeautifulSoup.BeautifulSoup(
                        _read_kong_page(kong_post_url))
                except Exception:
                    print("*** Erreur en demandant la soupe ! ***")
        else:
            # Same page: only the fragment changes.
            kong_post_url = kong_post_url.split(
                '#')[0] + '#' + next_kong_post_url.split('#')[1]
        print("NEW KONG POST URL : %s" % kong_post_url)