def _iterparse(self, h):
    """Yield one item dict for every product matched in a result page.

    Only the slice of the page returned by skstr.findbetween() for the
    '<div class="relative">' and u'<!-- ▲メイン -->' markers is scanned.
    Matches with an empty key or title, or with a key that is not a
    non-zero integer, are skipped.

    @param  h  unicode
    @yield  {kw}  dict with keys 'id' (int), 'url', 'title', 'brand'
    """
    h = skstr.findbetween(h, '<div class="relative">', u'<!-- ▲メイン -->')
    if not h:
        return
    for m in self._rx_parse.finditer(h):
        brand, key, title = m.group(1), m.group(2), m.group(3)
        if not key or not title:
            continue
        try:
            key = int(key)
        except ValueError:
            # Non-numeric product id: skip the entry.  Only the conversion
            # failure is swallowed, not arbitrary errors.
            continue
        if not key:
            continue
        yield {
            'id': key,
            'url': "https://www.melonbooks.co.jp/detail/detail.php?product_id=%s" % key,
            'title': unescapehtml(title),
            'brand': unescapehtml(brand),
            #'price': price, # price is not parsed here
        }
def _iterparse(self, h): """ @param h unicode @yield {kw} """ for m in self._rx_parse.finditer(h): hh = m.group() mm = self._rx_url_title.search(hh) if mm: url = mm.group(1) title = clean_title(unescapehtml(mm.group(2))) id = self._parseurlid(url) path = self._parseurlpath(url) if id and path: mm = self._rx_brand.search(hh) brand = unescapehtml(mm.group(1)) if mm else None mm = self._rx_img.search(hh) img = mm.group(1).replace('r.jpg', '.jpg') if mm else None mm = self._rx_price.search(hh) try: price = int(mm.group(1).replace(',', '')) except: price = 0 yield { 'url': "http://www.toranoana.jp" + url, # str not None 'id': id, 'title': title, # unicode not None 'image': img, # str or None 'brand': brand, # unicode or None 'price': price, # int not None }
def _iterparse(self, h): """ @param h unicode @yield {kw} """ try: start = h.find(u"検索結果(タイトル)") # int stop = h.find(u"ブランド別製品リスト") # int if start > 0 and stop > start: hh = h[start:stop] years = [] # [int year, int start] for m in self._rx_year.finditer(hh): years.append(( int(m.group(1)), m.start(), )) if not years: dwarn("cannot find release years, maybe, unknown years") id0 = title0 = None # first, yield the matched game m = self._rx_first_id.search(hh) if m: id0 = int(m.group(1)) if id0: m = self._rx_first_title.search(hh) if m: title0 = unescapehtml(m.group(1)) if id0 and title0: year0 = years[0][0] if years else None brand0 = self._parsebrand(h) yield { 'id': id0, 'title': title0, 'date': self._parsedate(h), 'brand': brand0, 'year': year0, } # then, parse index of years # iterparse and compare index against year index for m in self._rx_product.finditer(hh): id = int(m.group(1)) title = unescapehtml(m.group(2)) year = None if years: for y, start in years: if start > m.start(): break year = y yield {'id': id, 'title': title, 'year': year} except ValueError: # raised by int() dwarn("failed to convert to int")
def _iterparsewriters(self, h): """ @param h unicode html @yield unicode """ m = self._rx_info_writers.search(h) if m: line = unescapehtml(m.group(1)) for m in self._rx_staff.finditer(line): yield unescapehtml(m.group(1))
def t_unicode(t):
    """Unescape, UTF-8 decode and strip a byte string.

    Undecodable bytes are ignored.  Falsy input (None, empty) gives None.

    @param  t  str
    @return  unicode or None
    """
    if not t:
        return None
    decoded = unescapehtml(t).decode('utf8', errors='ignore')
    return decoded.strip()
def translate(text, to='en', fr='ja'):
    """Translate text through the web service and return the result.

    The translation is extracted from the response body with __re_search,
    decoded from UTF-8 (undecodable bytes are ignored) and HTML-unescaped.

    @param  text  unicode not None
    @param  fr  unicode not None, must be valid language code
    @param  to  unicode not None, must be valid language code
    @return  unicode or None  None when the connection fails, the response
             is not OK or is 1000 bytes or shorter, or no translation is
             found in the body
    """
    try:
        r = session.get(
            api(to, fr),
            headers=GZIP_HEADERS,
            params={'before': text})
        content = r.content
        # Bodies of 1000 bytes or less are not treated as result pages.
        if not r.ok or len(content) <= 1000:
            return None
        # Extract text within '<textarea .*name="after">' and '</textarea>'
        m = __re_search.search(content)
        if not m:
            # Never hand the raw page back to the caller as a translation.
            dwarn("content not matched: %s" % content)
            return None
        return unescapehtml(m.group(1).decode('utf8', errors='ignore'))
    except requests.ConnectionError as e:
        dwarn("connection error", e.args)
    return None
def _parsejson(self, data): """@reimp @param data @return {kw} @raise """ items = data['items'] for item in items: if item['romanTitle'] == '::inedited:: ': item['romanTitle'] = '' f = _PATCHES.get(item['id']) if f: for k, v in f.iteritems(): item[k] = v for k in 'title', 'romanTitle', 'brand': t = item[k] if t: item[k] = unescapehtml( t).rstrip() # remove right most space t = item.get('releaseDayNumber') if t and isinstance(t, int): s = "%s" % t if s.endswith('44'): # date number should not ends with > 31 t -= 44 - 28 # change to 28 item['releaseDayNumber'] = t return items
def _iterparsebrands(self, h): """ @param h unicode html @yield {kw} """ try: m = self._rx_brands.search(h) if m: line = m.group(1) for hh in line.split(u'、'): id = int(self._rx_brands_id.search(hh).group(1)) name = unescapehtml( self._rx_brands_name.search(hh).group(1)) yield { 'id': id, # int 'name': name, # unicode 'img': "http://media.erogetrailers.com/img/brand/%i.png" % id, # str #'url': "http://erogetrailers.com/brand/%i" % id, # not used } except Exception, e: dwarn(e)
def _parsemetadesc(self, h): """ @param h unicode html @return kw """ ret = {} m = self._rx_meta_desc.search(h) if m: desc = m.group(1) m = self._rx_desc_title.search(desc) if m: ret['title'] = unescapehtml(m.group(1)) #m = self._rx_desc_brand.search(desc) #if m: # brand = unescapehtml(m.group(1)) # if brand[-1] == u'の': # brand = brand[:-1] # ret['brand'] = brand m = self._rx_desc_price.search(desc) if m: price = m.group(1).replace(',', '') try: ret['price'] = int(price) except: pass return ret
def translate(text, to='en', fr='ja'):
    """Translate text through HONYAKU_API and return the result.

    The translated text is cut out of the response body between
    HONYAKU_TEXT_START and HONYAKU_TEXT_STOP, decoded from UTF-8
    (undecodable bytes are ignored) and HTML-unescaped.

    @param  text  unicode not None
    @param* fr  unicode not None, must be valid language code
    @param* to  unicode not None, must be valid language code
    @return  unicode or None  None when the connection fails, the response
             is not OK, or the delimiters are not found
    """
    try:
        r = session.post(
            HONYAKU_API,
            headers=GZIP_HEADERS,
            data={
                'SSRC': text,
                'SLANG': niftydef.nifty_lang(fr),
                'TLANG': niftydef.nifty_lang(to),
                #'txtDirection': fr + to, # not needed
                #'XMODE': 0, # not needed
            },
        )
        if not r.ok:
            return None
        # Example: ret('', {"translatedText":"If you can be calm, true-kun or 210 Yen."}, 200, null, null);
        ret = skstr.findbetween(r.content, HONYAKU_TEXT_START, HONYAKU_TEXT_STOP)
        if ret is None:
            # NOTE(review): assumes findbetween() signals "not found" with
            # None -- confirm against skstr.
            return None
        return skstr.unescapehtml(ret.decode('utf8', errors='ignore'))
    except requests.ConnectionError as e:
        dwarn("connection error", e.args)
    return None
def translate(self, t, to='auto', fr='auto'):
    """Translate t by posting it to self.api and return the result.

    The result is the part of the response body between self._TEXT_BEGIN
    and self._TEXT_END, HTML-unescaped.

    @param  t  unicode
    @param* to  str
    @param* fr  str
    @return  unicode or None  None when the connection fails, the body is
             empty, or either delimiter is not found
    """
    try:
        r = self.session.post(self.api, headers=self.headers, data={
            'hl': googledef.lang2locale(to),
            'sl': googledef.lang2locale(fr),
            'q': t,
        })
        h = r.content
        if h:
            start = h.find(self._TEXT_BEGIN)
            if start > 0:
                start += len(self._TEXT_BEGIN)
                stop = h.find(self._TEXT_END, start)
                if stop > 0:
                    # NOTE(review): r.content is not decoded here; confirm
                    # that unescapehtml() or the caller handles the bytes.
                    return unescapehtml(h[start:stop])
    except requests.ConnectionError as e:
        dwarn("connection error", e.args)
    return None
def _parsetitle(self, h):
    """Return the title from the title meta tag.

    Whatever self._rx_title matches is removed from the meta value, then the
    rest is unescaped and stripped.

    @param  h  unicode html
    @return  unicode or None
    """
    meta = self._parsemeta(self._rx_meta_title, h)
    if not meta:
        return None
    trimmed = self._rx_title.sub('', meta)
    return unescapehtml(trimmed).strip()
def _parsebanner(self, h): """ @param h unicode html @return unicode or None """ m = self._rx_banner.search(h) if m: return unescapehtml(m.group(1))
def _parsetitle(self, h):
    """Return the first meta keyword, unescaped, as the title.

    @param  h  unicode html
    @return  unicode  None when self._parsemetakw() gives nothing
    """
    keywords = self._parsemetakw(h)
    if not keywords:
        return None
    return unescapehtml(keywords[0])
def _parsetitle(self, h): """ @param h unicode @return unicode """ m = self._re_title.search(h) if m: return unescapehtml(m.group(1))
def _parsecomment(self, h): """ @param h unicode html @return unicode """ m = self._rx_comment.search(h) if m: return unescapehtml(m.group(1))
def _parsetitle(self, h):
    """Return the text between 'width:auto;">' and '</strong>', unescaped.

    The text is rejected when it is empty or still contains a '<'.

    @param  h  unicode html
    @return  unicode  None when no acceptable text is found
    """
    text = skstr.findbetween(h, 'width:auto;">', '</strong>')
    if not text or '<' in text:
        return None
    return unescapehtml(text)
def _parsekeywords(self, h):
    """Return the keywords meta value, unescaped and split on ','.

    @param  h  unicode html
    @return  [unicode] or None
    """
    meta = self._parsemeta(self._rx_meta_keywords, h)
    if not meta:
        return None
    return unescapehtml(meta).split(',')
def _parseseries(self, h): """ @param h unicode html @return unicode or None """ m = self._rx_series.search(h) if m: return unescapehtml(m.group(1))
def _unescape_term_text(text): """ @param text unicode @return unicode """ if not text or '&' not in text or ';' not in text: return text return skstr.unescapehtml(text).replace('&eos;', defs.TERM_ESCAPE_EOS)
def _iterparsedescriptions(self, h): """ @param h unicode html @yield unicode """ for m in self._rx_desc.finditer(h): yield unescapehtml( self._removescripts(self._replacelinks(m.group())))
def _parsedate(self, h): """ @param h unicode html @return unicode or None """ m = self._rx_info_date.search(h) if m: return unescapehtml(m.group(1))
def _parsetd(self, rx, h): """ @param h unicode html @return unicode or None """ m = rx.search(h) if m: return unescapehtml(m.group(1))
def _iterparseddlinks(self, *args, **kwargs):
    """Yield the unescaped text of every link in a dd section.

    All arguments are forwarded to self._parsedd(); nothing is yielded when
    it returns a falsy value.

    @yield  unicode
    """
    section = self._parsedd(*args, **kwargs)
    if not section:
        return
    for link in self._rx_link.finditer(section):
        yield unescapehtml(link.group(1))
def _parsetitle(self, h):
    """Return the part of the meta keyword before the first ',', unescaped.

    @param  h  unicode html
    @return  unicode or None
    """
    keyword = self._parsemetakeyword(h)
    if not keyword:
        return None
    head = keyword.split(',', 1)[0]
    return unescapehtml(head)
def _parsebrand(self, h): """ @param h unicode html @return unicode or None """ m = self._rx_brand.search(h) if m: return unescapehtml( m.group(1)).strip() # there is a space in the beginning
def _iterparsefields(self, h): """ @param h unicode @yield (str key, unicode or None) """ for k, rx in self._rx_fields: m = rx.search(h) if m: yield k, unescapehtml(m.group(1)).strip()
def _parsebrand(self, h): """ @param h unicode html @return unicode or None """ m = self._rx_brand.search(h) if m: return unescapehtml(m.group(1)).replace(" / ", ',').replace(u"/", ',')
def _iterparsecharacters(self, h): """ @param h unicode @yield kw """ m = self._rx_image.search(h) if m: prefix = m.group() for i in xrange(1, 100): img = "%sc%02d.jpg" % (prefix, i) start = h.find(img) if start < 0: break stop = h.find('</table>', start) if start < 0: break desc = h[start:stop] m = self._rx_label.search(desc) label = unescapehtml(m.group(1)) if m else '' m = self._rx_cv.search(desc) cv = unescapehtml(m.group(1)) if m else '' name = yomi = '' m = self._rx_chara.search(desc) if m: name = unescapehtml(m.group(1)).replace(u' ', ' ') # u3000 # ●羽馬 紫織(はば・しおり) beg = name.find(u'(') if beg > 0: end = name.rfind(u')') if end > 0: yomi = name[beg + 1:end].replace(u'・', ' ') name = name[:beg] yield { 'id': i, # int 'img': self.HOST + img, 'label': label, 'name': name.strip(), 'yomi': yomi.strip(), 'cv': cv, # unicode }
def _parseddlink(self, *args, **kwargs):
    """Return the unescaped text of the first link in a dd section.

    All arguments are forwarded to self._parsedd().  An empty string is
    returned when the section is missing or contains no link.

    @return  unicode not None
    """
    section = self._parsedd(*args, **kwargs)
    link = self._rx_link.search(section) if section else None
    if not link:
        return ''
    return unescapehtml(link.group(1))