Example #1
    def _export_(self):
        self.logger.debug('Récupération du forum %s (page %d)',
                          self.forum.oldid, self.page)

        # Download the page
        response = self.session.get("/{}p{}-a".format(self.forum.oldid,
                                                      self.page))
        document = PyQuery(response.text)

        # Get the topics
        for element in document.find('div.topictitle'):
            e = PyQuery(element)

            match = re.search(r"/t(\d+)-(.*)", clean_url(e("a").attr("href")))
            topic_id = int(match.group(1))
            if topic_id not in self.announcements:
                topic_slug = match.group(2)
                f = e.parents().eq(-2)
                locked = 1 if ("verrouillé"
                               in f("td img").eq(0).attr("alt")) else 0
                views = int(f("td").eq(5).text())
                topic_type = TOPIC_TYPES.get(e("strong").text(), 0)
                title = e("a").text()

                self.add_child(
                    Topic(topic_id, topic_type, title, locked, views,
                          topic_slug))
                if topic_type >= 2:
                    # The topic is an announcement, save its id to avoid exporting it again
                    self.announcements.append(topic_id)
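
The snippet above relies on a clean_url helper and a TOPIC_TYPES mapping that are defined elsewhere in the project. A minimal sketch of what they could look like (hypothetical definitions, not the project's actual code):

from urllib.parse import urlparse

# Hypothetical stand-ins for the helpers used by _export_ above.
TOPIC_TYPES = {
    "Annonce globale:": 3,  # assumed labels; the real mapping depends on the forum language
    "Annonce:": 2,
    "Post-it:": 1,
}


def clean_url(url):
    """Strip scheme, host and query string so only the path remains."""
    return urlparse(url).path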
Example #2
def get_topics():
    logging.info('Récupération des sujets')
    if config.debug:
        progress = progressbar.NoProgressBar()
    else:
        progress = progressbar.ProgressBar(widgets=[progressbar.SimpleProgress('/'), ' ', progressbar.Bar("#","[","]"), progressbar.Percentage()], maxval=save.nbtopics)
    progress.start()

    n = len(save.topics)
    
    ids = [i["id"] for i in save.topics]
    
    for forum in [i for i in save.forums if (i["type"] == "f" and i["parsed"] == False)]:
        logging.debug('Récupération : sujets du forum %d', forum["id"])
        subtopics = []
        subids = []
        d = PyQuery(url=config.rooturl + '/' + forum['type'] + str(forum['id']) + '-a', opener=fa_opener)
        result = re.search(r'function do_pagination_start\(\)[^\}]*start = \(start > \d+\) \? (\d+) : start;[^\}]*start = \(start - 1\) \* (\d+);[^\}]*\}', d.text())

        try:
            pages = int(result.group(1))
            topicsperpages = int(result.group(2))
        except AttributeError:
            # No pagination script found: the forum fits on a single page
            pages = 1
            topicsperpages = 0
            
        for page in range(0,pages):
            if page >= 1:
                d = PyQuery(url=config.rooturl + '/' + forum['type'] + str(forum['id']) + 'p' + str(page*topicsperpages) + '-a', opener=fa_opener)

            for i in d.find('div.topictitle'):
                e = PyQuery(i)
                
                id = int(re.search(r"/t(\d+)-.*", e("a").attr("href")).group(1))
                if id not in ids and id not in subids:
                    logging.debug('Récupération : sujet %d', id)
                    f = e.parents().eq(-2)
                    locked = u"verrouillé" in f("td img").eq(0).attr("alt")
                    views = int(f("td").eq(5).text())
                    subtopics.append({'id': id, 'type': e("strong").text(), 'parent': forum['newid'], 'title': e("a").text(), 'locked': locked, 'views': views, 'parsed': False})
                    subids.append(id)
                    
                    n += 1
                    progress.update(n)
                else:
                    logging.warning('Le sujet %d a déjà été récupéré.', id)
        save.topics.extend(subtopics)
        ids.extend(subids)
        [i for i in save.forums if i == forum][0]["parsed"] = True
    progress.end()
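
The trickiest part of get_topics() is the regular expression that pulls the page count and the topics-per-page step out of the inline do_pagination_start() script. A standalone sketch of that extraction, run against a made-up script fragment (the sample below is illustrative, not taken from a real forum):

import re

# Illustrative fragment mimicking the inline pagination script of a forum page
sample = """
function do_pagination_start()
{
    start = (start > 25) ? 25 : start;
    start = (start - 1) * 50;
}
"""

pattern = (r'function do_pagination_start\(\)[^\}]*'
           r'start = \(start > \d+\) \? (\d+) : start;[^\}]*'
           r'start = \(start - 1\) \* (\d+);[^\}]*\}')

match = re.search(pattern, sample)
if match:
    pages = int(match.group(1))            # -> 25
    topics_per_page = int(match.group(2))  # -> 50
else:
    # Same fallback as get_topics(): assume a single page
    pages, topics_per_page = 1, 0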
Example #3
from pyquery import PyQuery


def parsea_data(raw):
    d = PyQuery(raw)
    results = []
    elems = d('div.mb4.w-100.w-25-l.w-50-m')
    for el in elems.items():
        titleEl = PyQuery(el.find("h3.mt3.mb0.b"))
        imgEl = PyQuery(el.find("div[data-bg-src]"))
        groupEl = PyQuery(el.find("span.f7"))
        userEl = PyQuery(el.find("span.f7>a"))
        # title
        title = titleEl.text()
        # img
        image = imgEl.attr("data-bg-src")
        # group: the text has the form "<group name> By <user>"
        group_text = groupEl.text()
        group = group_text[:group_text.index(" By ")]
        # user
        user_name = userEl.text()
        user_link = userEl.attr("href")
        # type: a "timelapse" icon marks a time-limited sale, otherwise a subscription
        icon = PyQuery(el.find("i.material-icons.v-mid"))
        if icon.text() == 'timelapse':
            item_type = 'sales'
        else:
            item_type = 'subscription'

        info = icon.parents('span').text()
        info = info.replace('timelapse', '')
        info = info.replace('all_inclusive', '')

        # price
        priceEl = PyQuery(
            el.find(
                "div.w-100.absolute.bottom-0.mb3.black>div:nth-of-type(3)"))
        price = priceEl.text()
        results.append({
            'title': title,
            'image': image,
            'group': group,
            'user': {
                'name': user_name,
                'link': user_link
            },
            'type': item_type,
            'info': info,
            'price': price
        })
    return results
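
A possible way to call parsea_data(), assuming the page is fetched with requests (the URL below is a placeholder, not a real endpoint):

import requests

# Hypothetical usage; the URL stands in for the page being scraped
response = requests.get("https://example.com/marketplace")
for item in parsea_data(response.text):
    print(item["title"], item["price"])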
Example #4
    def _export_(self):
        self.logger.debug("Récupération du forum %s (page %d)", self.forum.oldid, self.page)

        # Download the page
        response = self.session.get("/{}p{}-a".format(self.forum.oldid, self.page))
        document = PyQuery(response.text)

        # Get the topics
        for element in document.find("div.topictitle"):
            e = PyQuery(element)

            topic_id = int(re.search(r"/t(\d+)-.*", clean_url(e("a").attr("href"))).group(1))
            if topic_id not in self.announcements:
                f = e.parents().eq(-2)
                locked = 1 if ("verrouillé" in f("td img").eq(0).attr("alt")) else 0
                views = int(f("td").eq(5).text())
                topic_type = TOPIC_TYPES.get(e("strong").text(), 0)
                title = e("a").text()

                self.add_child(Topic(topic_id, topic_type, title, locked, views))
                if topic_type >= 2:
                    # The topic is an announcement, save its id to avoid exporting it again
                    self.announcements.append(topic_id)
Example #5
def get_topics():
    logging.info("Récupération des sujets")
    if config.debug:
        progress = progressbar.NoProgressBar()
    else:
        progress = progressbar.ProgressBar(
            widgets=[progressbar.SimpleProgress("/"), " ", progressbar.Bar("#", "[", "]"), progressbar.Percentage()],
            maxval=save.nbtopics,
        )
    progress.start()

    n = len(save.topics)

    ids = [i["id"] for i in save.topics]

    for forum in [i for i in save.forums if (i["type"] == "f" and i["parsed"] == False)]:
        logging.debug("Récupération : sujets du forum %d", forum["id"])
        subtopics = []
        subids = []
        d = PyQuery(url=config.rooturl + "/" + forum["type"] + str(forum["id"]) + "-a", opener=fa_opener)
        result = re.search(
            "function do_pagination_start\(\)[^\}]*start = \(start > \d+\) \? (\d+) : start;[^\}]*start = \(start - 1\) \* (\d+);[^\}]*\}",
            d.text(),
        )

        try:
            pages = int(result.group(1))
            topicsperpages = int(result.group(2))
        except AttributeError:
            # No pagination script found: the forum fits on a single page
            pages = 1
            topicsperpages = 0

        for page in range(0, pages):
            if page >= 1:
                d = PyQuery(
                    url=config.rooturl
                    + "/"
                    + forum["type"]
                    + str(forum["id"])
                    + "p"
                    + str(page * topicsperpages)
                    + "-a",
                    opener=fa_opener,
                )

            for i in d.find("div.topictitle"):
                e = PyQuery(i)

                id = int(re.search(r"/t(\d+)-.*", e("a").attr("href")).group(1))
                if id not in ids and id not in subids:
                    logging.debug("Récupération : sujet %d", id)
                    f = e.parents().eq(-2)
                    locked = u"verrouillé" in f("td img").eq(0).attr("alt")
                    views = int(f("td").eq(5).text())
                    subtopics.append(
                        {
                            "id": id,
                            "type": e("strong").text(),
                            "parent": forum["newid"],
                            "title": e("a").text(),
                            "locked": locked,
                            "views": views,
                            "parsed": False,
                        }
                    )
                    subids.append(id)

                    n += 1
                    progress.update(n)
                else:
                    logging.warning("Le sujet %d a déjà été récupéré.", id)
        save.topics.extend(subtopics)
        ids.extend(subids)
        [i for i in save.forums if i == forum][0]["parsed"] = True
    progress.end()
Example #6
    def parse_items(self, urls):
        # Fetch every page concurrently; each worker appends its parsed
        # document to docs
        docs = []
        threads = [
            threading.Thread(target=get, args=(url, docs)) for url in urls
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        for item_doc in docs:
            # The word id is embedded in a notSatisfied(...) onclick handler
            word_id = None
            match = re.search(r"notSatisfied(Lang)?\( ?'(\d+)' ?[,\)]",
                              item_doc.html())
            if match:
                word_id = match.group(2)
            for locale in item_doc("article.pronunciations"):
                locale = PyQuery(locale)
                lang_header = locale('header[id=%s]' % self.lang.split('_')[0])
                if lang_header:
                    word = re.search(r"(.*) の発音",
                                     lang_header.text()).group(1)
                    if self.lang == 'en_usa':
                        els = locale('header[id=%s]' % self.lang).next_all()
                    else:
                        els = locale('.show-all-pronunciations li')
                    # Collect the pronunciation entries, skipping ads and
                    # stopping at the next language header
                    lis = []
                    for el in els:
                        el = PyQuery(el)
                        if el.has_class('li-ad'):
                            continue
                        if el.is_('header'):
                            break
                        lis.append(el)
                    for li in lis:
                        i = PyQuery(li('span.play'))
                        text = i.parents('li').eq(0).text()
                        user = None
                        match = re.search(r"発音したユーザ: (.*) \(", text)
                        if match:
                            user = match.group(1)
                        # The audio code is passed to Play(...) or
                        # PlayPhrase(...) in the element's onclick handler
                        onclick = i.attr('onclick')
                        match = re.search(r"Play\(.*,'(.*)',.*,.*,.*,.*,.*\)",
                                          onclick)
                        if match:
                            code = match.group(1)
                            url = ('https://audio00.forvo.com/mp3/' +
                                   base64_decode(code))
                            self.results.append({
                                'word': word,
                                'url': url,
                                'word_id': word_id,
                                'user': user
                            })
                        else:
                            match = re.search(r"PlayPhrase\(.*,'(.*)',.*\)",
                                              onclick)
                            if match:
                                code = match.group(1)
                                url = ('https://audio00.forvo.com/phrases/mp3/'
                                       + base64_decode(code))
                                self.results.append({
                                    'word': word,
                                    'url': url,
                                    'word_id': word_id,
                                    'user': user
                                })
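
parse_items() relies on a get(url, docs) worker and a base64_decode() helper that are not shown here. A rough sketch of what they might look like, assuming the pages are fetched with requests and the codes are plain Base64 (both are assumptions, not the scraper's actual implementation):

import base64
import requests
from pyquery import PyQuery


def get(url, docs):
    # Hypothetical worker: fetch one page and collect its parsed document
    response = requests.get(url, timeout=10)
    docs.append(PyQuery(response.text))


def base64_decode(code):
    # Assumes the code is standard Base64; the site's real encoding may differ
    return base64.b64decode(code).decode("utf-8")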