def _export_(self):
    self.logger.debug('Retrieving forum %s (page %d)', self.forum.oldid, self.page)

    # Download the page
    response = self.session.get("/{}p{}-a".format(self.forum.oldid, self.page))
    document = PyQuery(response.text)

    # Get the topics
    for element in document.find('div.topictitle'):
        e = PyQuery(element)

        topic_id = int(
            re.search(r"/t(\d+)-.*", clean_url(e("a").attr("href"))).group(1))

        if topic_id not in self.announcements:
            topic_slug = re.search(
                r"/t(\d+)-(.*)", clean_url(e("a").attr("href"))).group(2)

            f = e.parents().eq(-2)
            # "verrouillé" ("locked") is the alt text used by the forum's
            # French interface
            locked = 1 if "verrouillé" in f("td img").eq(0).attr("alt") else 0
            views = int(f("td").eq(5).text())
            topic_type = TOPIC_TYPES.get(e("strong").text(), 0)
            title = e("a").text()

            self.add_child(
                Topic(topic_id, topic_type, title, locked, views, topic_slug))

            if topic_type >= 2:
                # The topic is an announcement; save its id to avoid
                # exporting it again on later pages
                self.announcements.append(topic_id)
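
# _export_ relies on clean_url and TOPIC_TYPES defined elsewhere in the
# exporter. A minimal sketch of what the function assumes about them follows;
# both bodies are hypothetical stand-ins, not the project's actual definitions.

TOPIC_TYPES = {
    # Assumed mapping from the label shown in <strong> to a numeric type,
    # where values >= 2 mark announcements (see the check above).
    "Post-it :": 1,
    "Annonce :": 2,
    "Annonce globale :": 3,
}

def clean_url(url):
    """Assumed behavior: strip the query string (e.g. a session id) so the
    /t<id>-<slug> regexes match."""
    return url.split("?")[0]
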
def get_topics():
    logging.info('Retrieving topics')

    if config.debug:
        progress = progressbar.NoProgressBar()
    else:
        progress = progressbar.ProgressBar(
            widgets=[progressbar.SimpleProgress('/'), ' ',
                     progressbar.Bar("#", "[", "]"),
                     progressbar.Percentage()],
            maxval=save.nbtopics)
    progress.start()

    n = len(save.topics)
    ids = [i["id"] for i in save.topics]

    for forum in [i for i in save.forums if i["type"] == "f" and not i["parsed"]]:
        logging.debug('Retrieving: topics of forum %d', forum["id"])
        subtopics = []
        subids = []

        d = PyQuery(url=config.rooturl + '/' + forum['type'] + str(forum['id'])
                    + '-a',
                    opener=fa_opener)

        # Parse the forum's pagination javascript to get the number of pages
        # and the number of topics per page
        result = re.search(
            r'function do_pagination_start\(\)[^\}]*'
            r'start = \(start > \d+\) \? (\d+) : start;[^\}]*'
            r'start = \(start - 1\) \* (\d+);[^\}]*\}',
            d.text())
        try:
            pages = int(result.group(1))
            topicsperpages = int(result.group(2))
        except AttributeError:
            # No pagination script: the forum only has one page
            pages = 1
            topicsperpages = 0

        for page in range(pages):
            if page >= 1:
                d = PyQuery(url=config.rooturl + '/' + forum['type']
                            + str(forum['id']) + 'p'
                            + str(page * topicsperpages) + '-a',
                            opener=fa_opener)

            for i in d.find('div.topictitle'):
                e = PyQuery(i)
                id = int(re.search(r"/t(\d+)-.*", e("a").attr("href")).group(1))
                if id not in ids and id not in subids:
                    logging.debug('Retrieving: topic %d', id)
                    f = e.parents().eq(-2)
                    # u"verrouillé" ("locked") is the alt text used by the
                    # forum's French interface
                    locked = u"verrouillé" in f("td img").eq(0).attr("alt")
                    views = int(f("td").eq(5).text())
                    subtopics.append({
                        'id': id,
                        'type': e("strong").text(),
                        'parent': forum['newid'],
                        'title': e("a").text(),
                        'locked': locked,
                        'views': views,
                        'parsed': False})
                    subids.append(id)
                    n += 1
                    progress.update(n)
                else:
                    logging.warning('Topic %d has already been retrieved.', id)

        save.topics.extend(subtopics)
        ids.extend(subids)
        # forum is a reference to the dict stored in save.forums, so this
        # marks it as parsed in place
        forum["parsed"] = True

    progress.end()
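
# For reference, the pagination regex above targets javascript of the
# following shape; this snippet is an illustrative reconstruction from the
# pattern itself, not verbatim output from the site. group(1) captures the
# page count (the clamp value), group(2) the number of topics per page:
#
#   function do_pagination_start()
#   {
#       start = (start > 34) ? 34 : start;
#       start = (start - 1) * 50;
#   }
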
def parse_data(raw):
    results = []
    d = PyQuery(raw)
    elems = d('div.mb4.w-100.w-25-l.w-50-m')
    for el in elems.items():
        titleEl = PyQuery(el.find("h3.mt3.mb0.b"))
        imgEl = PyQuery(el.find("div[data-bg-src]"))
        groupEl = PyQuery(el.find("span.f7"))
        userEl = PyQuery(el.find("span.f7>a"))

        # title
        title = titleEl.text()

        # image
        image = imgEl.attr("data-bg-src")

        # group: the text before " By " names the group
        byIdx = groupEl.text().index(" By ")
        group = groupEl.text()[:byIdx]

        # user
        user_name = userEl.text()
        user_link = userEl.attr("href")

        # type: a "timelapse" icon marks a sale, anything else a subscription
        icon = PyQuery(el.find("i.material-icons.v-mid"))
        if icon.text() == 'timelapse':
            item_type = 'sales'
        else:
            item_type = 'subscription'
        info = icon.parents('span').text()
        info = info.replace('timelapse', '')
        info = info.replace('all_inclusive', '')

        # price
        priceEl = PyQuery(
            el.find(
                "div.w-100.absolute.bottom-0.mb3.black>div:nth-of-type(3)"))
        price = priceEl.text()

        results.append({
            'title': title,
            'image': image,
            'group': group,
            'user': {
                'name': user_name,
                'link': user_link
            },
            'type': item_type,
            'info': info,
            'price': price
        })
    return results
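
# Usage sketch for parse_data. The URL is a placeholder and the requests
# dependency is an assumption for illustration; any HTML string containing
# the classes above works:
#
#   import requests
#   for item in parse_data(requests.get("https://example.com/listing").text):
#       print(item["title"], item["price"])
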
def _export_(self):
    self.logger.debug("Retrieving forum %s (page %d)", self.forum.oldid, self.page)

    # Download the page
    response = self.session.get("/{}p{}-a".format(self.forum.oldid, self.page))
    document = PyQuery(response.text)

    # Get the topics
    for element in document.find("div.topictitle"):
        e = PyQuery(element)

        topic_id = int(
            re.search(r"/t(\d+)-.*", clean_url(e("a").attr("href"))).group(1))

        if topic_id not in self.announcements:
            f = e.parents().eq(-2)
            # "verrouillé" ("locked") is the alt text used by the forum's
            # French interface
            locked = 1 if "verrouillé" in f("td img").eq(0).attr("alt") else 0
            views = int(f("td").eq(5).text())
            topic_type = TOPIC_TYPES.get(e("strong").text(), 0)
            title = e("a").text()

            self.add_child(Topic(topic_id, topic_type, title, locked, views))

            if topic_type >= 2:
                # The topic is an announcement; save its id to avoid
                # exporting it again on later pages
                self.announcements.append(topic_id)
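
# Hypothetical sketch of the Topic node consumed by add_child in this
# variant; the real class is defined elsewhere in the exporter and likely
# carries its own export logic:

class Topic:
    def __init__(self, topic_id, topic_type, title, locked, views):
        self.topic_id = topic_id
        self.topic_type = topic_type  # 0 = normal, >= 2 = announcement (assumed)
        self.title = title
        self.locked = locked          # 1 if locked, 0 otherwise
        self.views = views
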
def parse_items(self, urls):
    docs = []
    # Fetch all pages concurrently; get() appends a parsed document to docs
    # for each url (list.append is atomic in CPython, so no lock is needed)
    threads = [
        threading.Thread(target=get, args=(url, docs)) for url in urls
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    for item_doc in docs:
        word_id = None
        match = re.search(r"notSatisfied(Lang)?\( ?'(\d+)' ?[,\)]",
                          item_doc.html())
        if match:
            word_id = match.group(2)

        for locale in item_doc("article.pronunciations"):
            locale = PyQuery(locale)
            lang_header = locale('header[id=%s]' % self.lang.split('_')[0])
            if not lang_header:
                continue

            # The header reads "<word> の発音" ("pronunciation of <word>")
            word = re.search(r"(.*) の発音", lang_header.text()).group(1)

            if self.lang == 'en_usa':
                els = locale('header[id=%s]' % self.lang).next_all()
            else:
                els = locale('.show-all-pronunciations li')

            # Collect the pronunciation entries, skipping ads and stopping
            # at the next language header
            lis = []
            for el in els:
                el = PyQuery(el)
                if el.has_class('li-ad'):
                    continue
                if el.is_('header'):
                    break
                lis.append(el)

            for li in lis:
                i = PyQuery(li('span.play'))
                text = i.parents('li').eq(0).text()

                # "発音したユーザ" means "pronounced by"
                user = None
                match = re.search(r"発音したユーザ: (.*) \(", text)
                if match:
                    user = match.group(1)

                # The audio path is base64-encoded in the onclick handler
                onclick = i.attr('onclick')
                match = re.search(r"Play\(.*,'(.*)',.*,.*,.*,.*,.*\)", onclick)
                if match:
                    code = match.group(1)
                    url = 'https://audio00.forvo.com/mp3/' + base64_decode(code)
                    self.results.append({
                        'word': word,
                        'url': url,
                        'word_id': word_id,
                        'user': user
                    })
                else:
                    match = re.search(r"PlayPhrase\(.*,'(.*)',.*\)", onclick)
                    if match:
                        code = match.group(1)
                        url = ('https://audio00.forvo.com/phrases/mp3/'
                               + base64_decode(code))
                        self.results.append({
                            'word': word,
                            'url': url,
                            'word_id': word_id,
                            'user': user
                        })
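
# parse_items relies on a get() helper and a base64_decode() helper defined
# elsewhere in the scraper. Minimal hypothetical stand-ins follow, assuming
# get() fetches a url and appends the parsed document to the shared docs
# list, and base64_decode() recovers the path embedded in the onclick code:

import base64

import requests

def get(url, docs):
    # Assumed behavior: fetch the page and collect it as a PyQuery document.
    docs.append(PyQuery(requests.get(url).text))

def base64_decode(code):
    # Assumed behavior: the onclick code is base64; decode it to a path string.
    return base64.b64decode(code).decode('utf-8')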