Exemplo n.º 1
0
def parse_arc(url, cur_year):
    """Collect this year's archive page URLs, following "next" pagination.

    The site section is picked by substring match on ``url``: ``news`` first,
    then ``gallery``; anything else is treated as ``forum``.

    Args:
        url: Address of the archive index page to scan.
        cur_year: Year to keep. It is tested with ``in`` against the link
            text, so it must be a string.

    Returns:
        A list of absolute URLs: the archive links containing ``cur_year``
        (in page order), followed by every ``rel="next"`` page discovered
        while walking those archives in sorted order.
    """
    site_root = "http://www.linux.org.ru"
    if 'news' in url:
        section = "news"
    elif 'gallery' in url:
        section = "gallery"
    else:
        section = "forum"
    base_url = "%s/%s/" % (site_root, section)
    links_re = re.compile(r'<a href="/%s/(.+?)">' % section)

    index_html = load_url(url)
    # FIXME (carried over from the original): the year filter needs special
    # handling for January.
    rlinks = ["%s%s" % (base_url, tail)
              for tail in links_re.findall(index_html)
              if cur_year in tail]

    # sorted() returns a copy, so appending to rlinks inside the loop does not
    # affect the iteration.
    for arc_url in sorted(rlinks):
        print("Loading arc: %s" % arc_url)
        page = load_url(arc_url)
        while True:
            n_link = next(pq(page).items('a[rel="next"]'), None)
            href = n_link.attr('href') if n_link is not None else None
            if not href:
                # No further page, or a "next" anchor without a usable href.
                break
            next_link = "%s%s" % (site_root, href)
            rlinks.append(next_link)
            print("Loading %s" % next_link)
            page = load_url(next_link)
    return rlinks
Exemplo n.º 2
0
    def handle(self, *args, **options):
        """Refresh avatar and "about" data for profiles not yet updated.

        For every ``UserProfile`` with ``is_updated=False`` the user's profile
        page is fetched and parsed. The profile is always marked as updated
        and saved afterwards, even when the page could not be loaded or
        decoded, so it is not picked up again by the same filter.
        """
        for user in UserProfile.objects.filter(is_updated=False):
            print(user.username)

            try:
                page = load_url("http://www.linux.org.ru/people/%s/profile" %
                                user.username)
            except UnicodeEncodeError:  # FIXME
                page = False

            # Equality (not truthiness) is kept on purpose: only an explicit
            # False from load_url is treated as "no page".
            if page == False:
                data = None
            else:
                try:
                    data = unicode(page.decode(encoding='UTF-8'))
                except UnicodeError:  # FIXME
                    # UnicodeError covers both UnicodeDecodeError (invalid
                    # UTF-8 bytes) and UnicodeEncodeError (implicit ASCII
                    # encode when the page is already unicode).
                    data = None

            if data is not None:
                d = pq(data)
                avatar = d('div.userpic img.photo').attr('src')
                print("%s %s" % (user.username, avatar))
                user.avatar = avatar
                # NOTE(review): this stores the PyQuery selection itself, not
                # its text or HTML -- confirm the model field copes with that.
                user.about = d('div#bd')

            user.is_updated = True
            user.save()
Exemplo n.º 3
0
Arquivo: new_scan.py Projeto: pi11/lor
def parse_arc(url):
    """Return absolute URLs for every archive link on the page at ``url``.

    The section is chosen by substring match on ``url`` (``news`` is checked
    before ``gallery``); any other URL is handled as a forum archive.
    """
    known_sections = (
        ('news',
         "http://www.linux.org.ru/news/",
         r'<a href="/news/(.+?)">'),
        ('gallery',
         "http://www.linux.org.ru/gallery/",
         r'<a href="/gallery/(.+?)">'),
    )

    # Forum is the fallback when the URL names neither news nor gallery.
    prefix = "http://www.linux.org.ru/forum/"
    pattern = r'<a href="/forum/(.+?)">'
    for marker, section_prefix, section_pattern in known_sections:
        if marker in url:
            prefix, pattern = section_prefix, section_pattern
            break

    matcher = re.compile(pattern)
    page = load_url(url)
    return ["%s%s" % (prefix, tail) for tail in matcher.findall(page)]
Exemplo n.º 4
0
def parse_forum(url):
    """Follow a forum listing's pager links and return the pages discovered.

    Starting at ``url``, each page is loaded and its ``td a`` anchors are
    scanned for a pager label. The first pager anchor that leads to a page
    not fetched before is followed; the walk ends when a page offers no such
    anchor.

    Args:
        url: Absolute URL of the first listing page.

    Returns:
        List of absolute URLs of the follow-up pages, in the order they were
        visited. ``url`` itself is not included.
    """
    print("Parsing forum: %s" % url)
    base_url = "http://www.linux.org.ru"
    links = []
    # Pages already fetched; without this a pager link pointing back to an
    # earlier page would make the loop run forever.
    visited = {url}
    new_url = url
    while new_url is not None:
        data = unicode(load_url(new_url))  # .decode(encoding='UTF-8'))
        d = pq(data)
        new_url = None
        for l in d('td a').items():
            text = l.text() or u""
            if u"← предыдущие" in text or u"вперед →" in text:
                href = l.attr('href')
                if not href:
                    # A pager anchor without a target cannot be followed.
                    continue
                candidate = "%s%s" % (base_url, href)
                if candidate in visited:
                    continue
                visited.add(candidate)
                links.append(candidate)
                new_url = candidate
                break
    print("New links: %s" % (links,))
    return links
Exemplo n.º 5
0
def get_threads(link):
    """Return the thread links found on the listing page at ``link``.

    Every anchor on the page is inspected. Pager anchors are skipped, and
    only hrefs that contain ``/forum/``, ``/news/`` or ``/gallery/`` and do
    not carry an ``?offset=`` query are kept. The ``#comments`` and ``#cut0``
    fragments are stripped from the kept hrefs.

    Args:
        link: URL of the listing page to scan.

    Returns:
        List of href values (as they appear in the page, minus the stripped
        fragments), in document order.
    """
    data = unicode(load_url(link))  # .decode(encoding='UTF-8'))
    d = pq(data)
    sections = ("/forum/", "/news/", "/gallery/")
    links = []
    for anchor in d('a').items():
        href = anchor.attr('href')
        text = anchor.text()
        if href is None or text is None:
            continue
        if u"← назад" in text or u"вперед →" in text:
            continue
        # Each section must be tested against href explicitly; a bare string
        # operand in an `or` chain is always true.
        if not any(section in href for section in sections):
            continue
        if "?offset=" in href:
            continue
        links.append(href.replace('#comments', '').replace('#cut0', ''))
    print("Threads links: %s" % (links,))
    return links
Exemplo n.º 6
0
def parse_thread(link, forum, page_id):
    """Parse one thread page and store its thread and messages.

    Args:
        link: URL of the thread page. Everything from ``?lastmod`` onwards is
            dropped before the page is loaded.
        forum: Forum the thread belongs to; when falsy it is looked up with
            ``get_forum_from_link(link)``.
        page_id: Page number within the thread. The first message of page 0
            is flagged as the opening post.

    Returns:
        The page content as produced by ``unicode(load_url(...))``. It is
        also returned early, without storing anything, when the first
        article yields no topic id.

    Raises:
        ValueError: If the topic menu has no anchor labelled "Ссылка", so the
            thread URL cannot be determined.
    """
    if not forum:
        forum = get_forum_from_link(link)
    site_root = "http://www.linux.org.ru"

    # Cut the "?lastmod..." suffix so the same thread always maps to one URL.
    last_mod = re.findall(r'(.*?)\?lastmod(.*?)', link)
    if last_mod:
        link = last_mod[0][0]
    start_url = link

    print("Parsing thread: %s" % start_url)
    data = unicode(load_url(start_url))  # .decode(encoding='UTF-8'))
    d = pq(data)
    title, message, topic, op_profile, op_t = parse_mess(d('article:first'))
    if topic is None:
        print("Topic is NONE!!! WTF!?? Ignore such thread for now")
        return data

    # No break on purpose: as in the original, the last matching anchor wins.
    thread_url = False
    for pl in d('ul#topicMenu a').items():
        if pl.text() == u"Ссылка":
            thread_url = "%s%s" % (site_root, pl.attr('href'))

    if not thread_url:
        raise ValueError("No thread link found in topic menu of %s" % start_url)

    if settings.DEBUG:
        try:
            print((u"user: %s, forum: %s, title: [%s] \n url %s,"
                   u" start_url:%s, time: %s, topic:%s ") % (op_profile, forum, title,
                                                             thread_url, start_url, op_t, topic))
        except UnicodeDecodeError:
            print("Unicode error")

    try:
        thread = Thread.objects.get(url=thread_url)
    except Thread.DoesNotExist:
        print("New thread! - %s" % thread_url)
        thread = Thread(user=op_profile, forum=forum,
                        title=title, url=thread_url,
                        lor_id=topic, publication_date=op_t)
        thread.thread_url = start_url
        thread.save()

    j = 0  # counts messages that were saved successfully on this page
    rpage_id = page_id + 1
    comments_count = 0
    for comment in d('article').items():
        comments_count += 1
        title, message, topic, op_profile, op_t = parse_mess(comment)
        if topic is None:
            print("=" * 20)
            print("Topic is none, continue...")
            continue

        lookup = dict(user=op_profile, lor_message_id=topic,
                      forum=forum, thread=thread)
        try:
            mes, cr = Message.objects.get_or_create(**lookup)
        except Message.MultipleObjectsReturned:
            # Keep the row with the lowest id and delete the other duplicates.
            duplicates = Message.objects.filter(**lookup).order_by("id")
            for k, m in enumerate(duplicates):
                if k == 0:
                    mes = m
                    cr = False
                else:
                    m.delete()
        except IntegrityError:
            # if already exists, try to get it; the row is not updated any
            # further and the loop moves on to the next article.
            Message.objects.get_or_create(**lookup)
            continue

        if cr:
            # NOTE(review): product of the in-page counter and the 1-based
            # page number -- confirm this is the intended message_id.
            mes.message_id = j * rpage_id
            mes.publication_date = op_t
            ms = MessageStore(text=message, ms=mes)
            ms.save()
        if j == 0 and page_id == 0:
            print("Top message...")
            mes.is_op = True
        try:
            mes.save()
        except IntegrityError:
            print("Error saving message!")
            continue

        j += 1
    print("Comments: %s" % comments_count)

    # Record that this URL has been processed.
    ParsedUrls.objects.get_or_create(url=start_url)
    return data