def parse_arc(url, cur_year):
    """Collect archive listing URLs for *cur_year*, following every
    rel="next" pagination link and returning the full list of page URLs."""
    # The section is inferred from the incoming URL; forum is the fallback.
    if 'news' in url:
        section = 'news'
    elif 'gallery' in url:
        section = 'gallery'
    else:
        section = 'forum'
    base_url = "http://www.linux.org.ru/%s/" % section
    links_re = re.compile(r'<a href="/%s/(.+?)">' % section)
    page = load_url(url)
    # Keep only archive entries whose path mentions the requested year.
    # FIXME for january =)
    rlinks = [base_url + rel for rel in links_re.findall(page) if cur_year in rel]
    # Walk each archive page and chase its rel="next" chain, recording
    # every intermediate page URL along the way.
    for rl in sorted(rlinks):
        print ("Loading arc: %s" % rl)
        page = load_url(rl)
        while True:
            n_link = next(pq(page).items('a[rel="next"]'), None)
            if n_link is None:
                break
            next_link = "http://www.linux.org.ru%s" % n_link.attr('href')
            rlinks.append(next_link)
            print ("Loading %s" % next_link)
            page = load_url(next_link)
    return rlinks
def handle(self, *args, **options):
    """Management-command entry point: fetch each not-yet-updated user's
    lor.ru profile page and store the avatar URL and "about" block.

    Every path through the loop sets is_updated=True so the command never
    revisits a profile, even when the fetch or decode fails.
    """
    for user in UserProfile.objects.filter(is_updated=False):
        print user.username
        try:
            page = load_url("http://www.linux.org.ru/people/%s/profile" % user.username)
        except UnicodeEncodeError:
            # FIXME: non-ASCII usernames break URL building; skip them but
            # mark as updated so the command does not loop on them forever.
            user.is_updated = True
            user.save()
            continue
        # load_url signals a failed fetch by returning False; mark done.
        if page == False:
            user.is_updated = True
            user.save()
            continue
        try:
            data = unicode(page.decode(encoding='UTF-8'))
        except UnicodeEncodeError:
            # FIXME -- NOTE(review): decode() raises UnicodeDecodeError,
            # not UnicodeEncodeError; this handler probably never fires.
            # Confirm the intended exception type.
            user.is_updated = True
            user.save()
            continue
        d = pq(data)
        # Avatar URL comes from the userpic image; the free-form "about"
        # text is the div#bd container.
        avatar = d('div.userpic img.photo').attr('src')
        print user.username, avatar
        about = d('div#bd')
        # NOTE(review): `about` is assigned as a PyQuery object; presumably
        # the model field coerces it to text on save -- verify.
        user.avatar = avatar
        user.about = about
        user.is_updated = True
        user.save()
def parse_arc(url):
    """Build absolute archive links for the section the URL belongs to.

    The section (news / gallery / forum) is guessed from *url*; every
    matching relative link on the page is prefixed with the section's
    base URL.
    """
    for section in ('news', 'gallery'):
        if section in url:
            break
    else:
        section = 'forum'  # default when neither marker is present
    base_url = "http://www.linux.org.ru/%s/" % section
    links_re = re.compile(r'<a href="/%s/(.+?)">' % section)
    page = load_url(url)
    return [base_url + rel for rel in links_re.findall(page)]
def parse_forum(url): print "Parsing forum: %s" % url base_url = "http://www.linux.org.ru" parsed = False links = [] new_url = url # "%s%s" % (base_url, url) while not parsed: data = unicode(load_url(new_url))#.decode(encoding='UTF-8')) d = pq(data) parsed = True for l in d('td a').items(): link = l.attr('href') text = l.text() if u"← предыдущие" in text or u"вперед →" in text: new_url = "%s%s" % (base_url, l.attr('href')) links.append(new_url) parsed = False break print "New links:", links return links
def get_threads(link): # print "Thread: %s " % link data = unicode(load_url(link))#.decode(encoding='UTF-8')) d = pq(data) links = [] for l in d('a').items(): link = l.attr('href') text = l.text() if link is None or text is None: continue # print link, text if u"← назад" in text or u"вперед →" in text: continue if "/forum/" in link or "/news/" in link or "/gallery/": if not "?offset=" in link: links.append( link.replace('#comments', '').replace('#cut0', '')) print "Threads links:", links return links
def parse_thread(link, forum, page_id):
    """Parse one thread page: create/refresh its Thread row, then store a
    Message (plus MessageStore text) for every <article> comment on the
    page.  Returns the raw page HTML.

    link     -- thread URL, possibly carrying a ?lastmod suffix (stripped).
    forum    -- Forum instance, or falsy to derive it from the link.
    page_id  -- zero-based page index; page 0 holds the opening post.
    """
    if not forum:
        forum = get_forum_from_link(link)
    base_url = "http://www.linux.org.ru/forum/"
    base_url2 = "http://www.linux.org.ru"
    # Drop a trailing "?lastmod=..." so the canonical URL is stable.
    last_re = re.compile(r'(.*?)\?lastmod(.*?)')
    last_mod = last_re.findall(link)
    if len(last_mod) > 0:
        link = last_mod[0][0]
    start_url = link
    print "Parsing thread: %s" % start_url
    data = unicode(load_url(start_url))
    d = pq(data)
    # The first <article> is the opening post; parse_mess returns
    # (title, message_html, lor_topic_id, author_profile, timestamp).
    first_mess = d('article:first')
    title, message, topic, op_profile, op_t = parse_mess(first_mess)
    if topic is None:
        print "Topic is NONE!!! WTF!?? Ignore such thread for now"
        return data
    # The canonical thread URL is taken from the topic menu's link entry.
    topic_links = d('ul#topicMenu a').items()
    thread_url = False
    for pl in topic_links:
        if pl.text() == u"Ссылка":
            thread_url = "%s%s" % (base_url2, pl.attr('href'))
    if not thread_url:
        print "Topic links", topic_links
        # NOTE(review): bare `raise` with no active exception raises
        # TypeError at runtime -- probably meant to raise a real exception
        # with a message; confirm intent.
        raise
    if settings.DEBUG:
        try:
            print (u"user: %s, forum: %s, title: [%s] \n url %s,"
                   u" start_url:%s, time: %s, topic:%s ") % (op_profile,
                   forum, title, thread_url, start_url, op_t, topic)
            pass
        except UnicodeDecodeError:
            print "Unicode error"
            pass
    try:
        thread = Thread.objects.get(url=thread_url)
    except Thread.DoesNotExist:
        print "New thread! - %s" % thread_url
        thread = Thread(user=op_profile, forum=forum, title=title,
                        url=thread_url, lor_id=topic,
                        publication_date=op_t)
    # Remember which page URL this pass actually fetched.
    thread.thread_url = start_url
    thread.save()
    j = 0
    # rpage_id is 1-based so message_id = j * rpage_id is nonzero past j=0.
    # NOTE(review): for j == 0 this is always 0 regardless of page --
    # presumably intentional for the opening post; verify.
    rpage_id = page_id + 1
    comments_count = 0
    for comment in d('article').items():
        comments_count += 1
        title, message, topic, op_profile, op_t = parse_mess(comment)
        if topic is None:
            print "=" * 20
            print "Topic is none, continue..."
            continue
        try:
            mes, cr = Message.objects.get_or_create(user=op_profile,
                                                    lor_message_id=topic,
                                                    forum=forum,
                                                    thread=thread)
        except Message.MultipleObjectsReturned:
            # Duplicate rows: keep the oldest (lowest id), delete the rest.
            k = 0
            for m in Message.objects.filter(user=op_profile,
                                            lor_message_id=topic,
                                            forum=forum,
                                            thread=thread).order_by("id"):
                if k == 0:
                    mes = m
                    cr = False
                else:
                    m.delete()
                k += 1
        except IntegrityError:
            # Race on creation: the row now exists, fetch it instead.
            mes, cr = Message.objects.get_or_create(user=op_profile,
                                                    lor_message_id=topic,
                                                    forum=forum,
                                                    thread=thread)
            # NOTE(review): this `continue` skips the `if cr:` body below,
            # so a message recovered here never gets its text stored and
            # j is not incremented -- confirm that is intended.
            continue
        if cr:
            # Newly created message: attach position, timestamp and text.
            mes.message_id = j * rpage_id
            mes.publication_date = op_t
            ms = MessageStore(text=message, ms=mes)
            ms.save()
            if j == 0 and page_id == 0:
                print "Top message..."
                mes.is_op = True
            try:
                mes.save()
            except IntegrityError:
                print "Error saving message!"
                continue
        j += 1
    print "Comments: %s" % comments_count
    # Record the URL as processed so it is not crawled again.
    p, c = ParsedUrls.objects.get_or_create(url=start_url)
    return data