def get_cats_thread(self, cat):
    """Fetch the level-2 sub-categories of ``cat`` and attach them to it.

    The sub-category list comes from ``self.get_sub_cats`` for the
    ``level2.php`` page of ``cat['id']``.  Each sub-category is then handed
    to ``self.get_sub_cats_thread`` and the outcome is stored under
    ``cat['children']``.

    :param cat: category dict; only its ``'id'`` key is read here.
    :returns: the same ``cat`` dict, mutated in place.  ``cat['children']``
        is:

        * ``[]`` when no sub-categories were found,
        * the direct return value of ``get_sub_cats_thread`` when there is
          exactly one sub-category (no pool is started for a single item),
        * otherwise whatever ``ThreadPool.killAllWorkers(None)`` returns
          (presumably the collected worker results -- confirm against the
          ThreadPool implementation).
    """
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(cat['id'])
    subcats = self.get_sub_cats(
        'http://top.taobao.com/level2.php?cat=%s' % cat['id'], 'cat', 2)

    # Guard empty / failed lookups before sizing the pool, the same way
    # get_top_keywords() does; otherwise we would build ThreadPool(0) or
    # call len() on None.
    if not subcats:
        cat['children'] = []
        return cat

    if len(subcats) == 1:
        cat['children'] = self.get_sub_cats_thread(subcats[0])
        return cat

    # Never use more than five workers.
    pool = ThreadPool(min(len(subcats), 5))
    for sc in subcats:
        pool.run(self.get_sub_cats_thread, callback=None, sc=sc)
    cat['children'] = pool.killAllWorkers(None)
    return cat
def get_cats(self):
    """Get top keywords categories.

    Fetches the top.taobao.com index page, reads every ``<li>`` inside
    ``<div id="nav">`` except the one whose id is ``'index'``, and turns each
    into ``{'id': 'TR_<ID>', 'title': <link text>}``.  Every category is then
    passed to ``self.get_cats_thread`` on a pool of at most five workers.

    :returns: ``None`` when the page could not be fetched or contains no
        ``#nav`` block; an empty list when the nav block has no usable
        entries; otherwise whatever ``ThreadPool.killAllWorkers(None)``
        returns (presumably the per-category results -- confirm against the
        ThreadPool implementation).
    """
    start_url = 'http://top.taobao.com/index.php?from=tbsy'
    rs = self.fetch(start_url)
    if not rs:
        return None

    soup = BeautifulSoup(rs.content,
                         convertEntities=BeautifulSoup.HTML_ENTITIES,
                         markupMassage=hexentityMassage)

    # find() yields None when the element is absent; treat an unexpected
    # page layout like a failed fetch instead of raising AttributeError.
    nav = soup.find('div', id='nav')
    if nav is None:
        return None

    cats = [
        {'id': 'TR_%s' % li['id'].encode('utf-8').upper(),
         'title': li.a.text.encode('utf-8').strip()}
        for li in nav.findAll('li')
        if li['id'] != 'index'
    ]
    # Nothing to dispatch: avoid creating a zero-worker pool.
    if not cats:
        return cats

    pool = ThreadPool(min(len(cats), 5))
    for cat in cats:
        pool.run(self.get_cats_thread, callback=None, cat=cat)
    return pool.killAllWorkers(None)
def get_top_keywords(self, cats=None, parent=None, up=True):
    """Get top keywords for all the categories.

    :param cats: categories to process; when empty or omitted they are
        obtained from ``self.get_cats()``.
    :param parent: forwarded unchanged to ``cat_top_keywords_thread``.
    :param up: forwarded unchanged to ``cat_top_keywords_thread``.
    :returns: ``[]`` when there are no categories at all, otherwise the
        value returned by ``ThreadPool.killAllWorkers(None)`` after one
        ``cat_top_keywords_thread`` task per category has been queued.
    """
    categories = cats or self.get_cats()
    if not categories:
        return []

    # At most five workers, fewer when there are fewer categories.
    workers = ThreadPool(min(len(categories), 5))
    for cat in categories:
        workers.run(self.cat_top_keywords_thread, callback=None,
                    cat=cat, parent=parent, up=up)
    return workers.killAllWorkers(None)
def cat_top_keywords(self, session, cat, up=True, offset=0, offsets=None):
    """Get top keywords in a specific category and store them.

    Crawls the ``level3.php`` listing for ``cat`` (first page plus every
    further page advertised by ``get_cat_top_keywords_pages``), then saves
    each keyword through ``session`` as a ``models.Keyword`` linked to
    ``cat``.

    Crawling and persisting are separated on purpose: the crawl recurses to
    follow additional page offsets, and persisting inside the recursion
    would commit the same keywords once per recursion level.

    :param session: database session providing ``add``, ``commit``,
        ``rollback`` and ``query``.
    :param cat: category object; ``cat.level``, ``cat.cid`` and
        ``cat.parent.cid`` are read to build the URLs.
    :param up: when true the URL carries ``up=true``, otherwise ``up=``.
    :param offset: offset of the first page to request.
    :param offsets: already-known page offsets; ``None`` or empty means
        "discover them from the first page".
    :returns: list of the keyword dicts produced by
        ``parse_cat_top_keywords`` (each is read via ``k['name']``); empty
        when the first page cannot be fetched.
    :raises IntegrityError: when a commit fails and no keyword with that
        name exists, i.e. the failure was not a duplicate name.
    """
    url_template = ('http://top.taobao.com/level3.php?cat=%s&level3=%s'
                    '&show=focus&up=%s&offset=%d')

    def page_url(page_offset):
        # level3 is left blank for level-2 categories.
        return url_template % (cat.parent.cid,
                               '' if cat.level == 2 else str(cat.cid),
                               'true' if up else '',
                               page_offset)

    def make_soup(content):
        return BeautifulSoup(content,
                             convertEntities=BeautifulSoup.HTML_ENTITIES,
                             markupMassage=hexentityMassage)

    def collect(start, known_offsets):
        """Return the keywords found from ``start`` onwards (no DB access)."""
        print('CAT:%s, level:%s' % (str(cat), str(cat.level)))
        print('OFFSET: %d' % start)
        found = []
        if not known_offsets or start == 0:
            url = page_url(start)
            print(url)
            first_page = self.fetch(url)
            if not first_page:
                return found
            soup = make_soup(first_page.content)
            found = self.parse_cat_top_keywords(soup, start)
            if start == 0:
                known_offsets = self.get_cat_top_keywords_pages(soup, start)
        print('OFFSETS: %s' % (known_offsets,))
        if not known_offsets:
            return found

        # Fetch the remaining pages concurrently, five workers at most.
        pool = ThreadPool(min(len(known_offsets), 5))
        last_idx = len(known_offsets) - 1
        for idx, page_offset in enumerate(known_offsets):
            # Only the last page is asked to look for further offsets.
            get_next = 'True' if idx == last_idx else 'False'
            pool.run(self.fetch, callback=None, url=page_url(page_offset),
                     config=dict(get_next=get_next, offset=page_offset))

        for page in pool.killAllWorkers(None):
            if not page:
                continue
            soup2 = make_soup(page.content)
            offset2 = int(page.config['offset'])
            found += self.parse_cat_top_keywords(soup2, offset2)
            print('GOT: %d' % offset2)
            if page.config['get_next'] != 'True':
                continue
            more_offsets = self.get_cat_top_keywords_pages(soup2, offset2)
            print(more_offsets)
            if not more_offsets:
                continue
            found += collect(offset2, more_offsets)
        return found

    response = collect(offset, offsets)

    for k in response:
        # Decode once and use the same value for insert and lookup.
        name = k['name'].decode('utf-8')
        new_keyword = models.Keyword(name)
        new_keyword.categories.append(cat)
        session.add(new_keyword)
        try:
            session.commit()
        except IntegrityError:
            session.rollback()
            existing = session.query(models.Keyword).filter(
                models.Keyword.name == name).first()
            if existing is None:
                # Not a duplicate-name conflict; do not hide the failure.
                raise
            # Link the existing keyword to this category only once.
            if cat not in existing.categories:
                existing.categories.append(cat)
            session.commit()
            print('Duplicate %s' % existing)
    return response
def cat_top_keywords(self, cat, level3='', up=True, offset=0, offsets=None):
    """Get top keywords in a specific category.

    Requests the ``level3.php`` listing for ``cat``/``level3``, parses the
    first page, then fetches every further page offset concurrently (at most
    five workers).  The last of those pages is additionally inspected for
    more offsets, and the method recurses on them.

    :param cat: category id placed in the ``cat`` query parameter.
    :param level3: value for the ``level3`` query parameter (may be blank).
    :param up: when true the URL carries ``up=true``, otherwise ``up=``.
    :param offset: offset of the first page to request.
    :param offsets: already-known page offsets; ``None`` or empty means
        "discover them from the first page".
    :returns: list of whatever ``parse_cat_top_keywords`` yields for every
        page that could be fetched; empty when the first page fails.
    """
    def page_url(page_offset):
        return ('http://top.taobao.com/level3.php?cat=%s&level3=%s'
                '&show=focus&up=%s&offset=%d') % (
            str(cat), str(level3), 'true' if up else '', page_offset)

    response = []
    if not offsets or offset == 0:
        first_page = self.fetch(page_url(offset))
        if not first_page:
            return response
        soup = BeautifulSoup(first_page.content)
        response = self.parse_cat_top_keywords(soup, offset)
        if offset == 0:
            offsets = self.get_cat_top_keywords_pages(soup, offset)

    if not offsets:
        return response

    pool = ThreadPool(min(len(offsets), 5))
    last_idx = len(offsets) - 1
    for idx, page_offset in enumerate(offsets):
        # Only the last page is asked to look for further offsets.
        get_next = 'True' if idx == last_idx else 'False'
        pool.run(self.fetch, callback=None, url=page_url(page_offset),
                 config=dict(get_next=get_next, offset=page_offset))

    for page in pool.killAllWorkers(None):
        if not page:
            continue
        soup2 = BeautifulSoup(page.content)
        offset2 = int(page.config['offset'])
        response += self.parse_cat_top_keywords(soup2, offset2)
        if page.config['get_next'] != 'True':
            continue
        more_offsets = self.get_cat_top_keywords_pages(soup2, offset2)
        if not more_offsets:
            continue
        response += self.cat_top_keywords(cat, level3, up, offset2,
                                          more_offsets)
    return response
def start_ui(test=False):
    """Build the Qt application, run the update check and enter the event loop.

    Creates the ``QApplication`` and ``MainUi`` (both stored on ``global_``),
    registers the log, reorder, skins and config tabs, shows the window, asks
    the updater for the latest release, and finally calls ``sys.exit`` with
    the result of the Qt event loop -- so this function does not return.

    :param test: when true, a one-worker pool is given a task that sleeps
        ten seconds and then calls ``nice_exit()``.
    """
    from PyQt5.QtWidgets import QApplication
    import sys
    from src.ui.tab_reorder import TabReorder
    from src.ui.tab_log import TabLog
    from src.ui.tab_config import TabConfig
    from src.ui.tab_skins import TabSkins

    logger.debug('starting QtApp object')
    global_.QT_APP = QApplication([])
    global_.MAIN_UI = MainUi()

    global_.MAIN_UI.add_tab(TabLog(), helpers={'write_log': 'write'})
    reorder_helpers = {
        'tab_reorder_update_view_after_remote_scan':
            'tab_reorder_update_view_after_remote_scan',
    }
    global_.MAIN_UI.add_tab(TabReorder(), helpers=reorder_helpers)

    # DCS installations are discovered before the skins tab is created.
    from src.misc import dcs_installs
    dcs_installs.discover_dcs_installations()
    global_.MAIN_UI.add_tab(TabSkins(), helpers={})

    global_.MAIN_UI.add_tab(TabConfig(),
                            helpers={'update_config_tab': 'update_config_tab'})
    global_.MAIN_UI.show()

    def pre_update_hook():
        # Updating only happens when sys has a 'frozen' attribute.
        if hasattr(sys, 'frozen'):
            I.hide()
            return True
        logger.warning('skipping update on script run')
        return False

    def cancel_update_hook():
        I.show()

    from utils import Progress
    # noinspection PyTypeChecker
    Progress.register_adapter(I)

    from src.updater import updater
    updater.find_and_install_latest_release(
        current_version=global_.APP_VERSION,
        executable_path='emft.exe',
        channel=Config().update_channel,
        cancel_update_hook=cancel_update_hook,
        pre_update_hook=pre_update_hook,
    )
    global_.MAIN_UI.update_config_tab()

    if test:
        logger.critical('RUNNING IN TEST MODE')
        import time
        from utils import ThreadPool, nice_exit

        def test_hook():
            time.sleep(10)
            nice_exit()

        test_pool = ThreadPool(1, 'test')
        test_pool.queue_task(test_hook)

    exit_code = global_.QT_APP.exec()
    sys.exit(exit_code)