示例#1
0
文件: crawler.py 项目: sasakuma/pyTOP
 def get_cats_thread(self, cat):
     '''Attach sub-category results to ``cat`` under ``'children'``.

     The level-2 sub-categories of ``cat['id']`` are looked up with
     ``get_sub_cats``.  When exactly one comes back, the value returned by
     ``get_sub_cats_thread`` for it is stored directly.  Otherwise every
     sub-category is queued on a ``ThreadPool`` (sized to the number of
     sub-categories, capped at 5) and whatever ``killAllWorkers(None)``
     returns is stored.

     Returns the same ``cat`` mapping that was passed in.
     '''
     cat_id = cat['id']
     print(cat_id)
     level2_url = 'http://top.taobao.com/level2.php?cat=%s' % cat_id
     subcats = self.get_sub_cats(level2_url, 'cat', 2)
     total = len(subcats)
     if total != 1:
         # Fan the sub-categories out over the pool and collect the results.
         pool = ThreadPool(min(total, 5))
         for sub in subcats:
             pool.run(self.get_sub_cats_thread, callback=None, sc=sub)
         cat['children'] = pool.killAllWorkers(None)
     else:
         # A lone sub-category is handled inline, without a pool.
         cat['children'] = self.get_sub_cats_thread(subcats[0])
     return cat
示例#2
0
文件: crawler.py 项目: sasakuma/pyTOP
 def get_cats(self):
     '''Get top keywords categories.

     Fetches the index page, builds one ``{'id', 'title'}`` dict for each
     ``<li>`` inside ``<div id="nav">`` (skipping the one whose id is
     ``index``), and hands each dict to ``get_cats_thread`` through a
     ``ThreadPool`` (sized to the number of categories, capped at 5).

     Returns ``None`` when the fetch yields nothing, otherwise whatever
     ``killAllWorkers(None)`` returns.
     '''
     page = self.fetch('http://top.taobao.com/index.php?from=tbsy')
     if not page:
         return None
     soup = BeautifulSoup(page.content,
                          convertEntities=BeautifulSoup.HTML_ENTITIES,
                          markupMassage=hexentityMassage)
     found = []
     for item in soup.find('div', id='nav').findAll('li'):
         if item['id'] == 'index':
             continue
         found.append({
             'id': 'TR_%s' % item['id'].encode('utf-8').upper(),
             'title': item.a.text.encode('utf-8').strip(),
         })
     pool = ThreadPool(min(len(found), 5))
     for entry in found:
         pool.run(self.get_cats_thread, callback=None, cat=entry)
     return pool.killAllWorkers(None)
示例#3
0
 def get_top_keywords(self, cats=None, parent=None, up=True):
     '''Get top keywords for all the categories.

     ``cats`` falls back to ``self.get_cats()`` when it is empty or not
     given; if there are still no categories an empty list is returned.
     Otherwise each category is queued for ``cat_top_keywords_thread``
     (together with ``parent`` and ``up``) on a ``ThreadPool`` sized to the
     number of categories, capped at 5, and the value returned by
     ``killAllWorkers(None)`` is handed back.
     '''
     cats = cats or self.get_cats()
     if not cats:
         return []
     pool = ThreadPool(min(len(cats), 5))
     for entry in cats:
         pool.run(self.cat_top_keywords_thread,
                  callback=None,
                  cat=entry,
                  parent=parent,
                  up=up)
     return pool.killAllWorkers(None)
示例#4
0
文件: crawler.py 项目: sasakuma/pyTOP
 def cat_top_keywords(self, session, cat, up=True, offset=0, offsets=None):
     '''Get top keywords in a specific category and store them.

     When ``offsets`` is empty or ``offset`` is 0 the listing page at
     ``offset`` is fetched and parsed.  For ``offset == 0`` the offsets of
     the following pages are read from that page.  Every page in ``offsets``
     is then fetched through a ``ThreadPool``; the last page of the batch is
     also inspected for further offsets and, if there are any, the method
     recurses for them.

     Keywords parsed by *this* call are saved through ``session`` as
     ``models.Keyword`` rows linked to ``cat``.  Keywords returned by a
     recursive call are not saved again here, because that call has already
     stored them.

     :param session: session object used for add/commit/rollback/query
     :param cat: category; ``cat.level``, ``cat.cid`` and ``cat.parent.cid``
         are read to build the URLs
     :param up: when true the URL carries ``up=true``, otherwise ``up=``
     :param offset: offset of the page handled by this call
     :param offsets: offsets of the pages to fetch in this batch; ``None``
         or empty means none are known yet
     :returns: the keyword items of this page and of every following page
         (empty list when the first fetch yields nothing)
     '''
     print('CAT:%s, level:%s' % (str(cat), str(cat.level)))
     print('OFFSET: %d' % offset)

     # The URL only varies by offset, so build the constant parts once.
     url_tpl = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d'
     parent_cid = cat.parent.cid
     level3 = '' if cat.level == 2 else str(cat.cid)
     up_flag = 'true' if up else ''

     response = []
     if not offsets or offset == 0:
         url = url_tpl % (parent_cid, level3, up_flag, offset)
         print(url)
         rs = self.fetch(url)
         if not rs:
             return response
         soup = BeautifulSoup(rs.content,
                              convertEntities=BeautifulSoup.HTML_ENTITIES,
                              markupMassage=hexentityMassage)
         response = self.parse_cat_top_keywords(soup, offset)
     if offset == 0:
         # offset == 0 always went through the branch above, so soup is set.
         offsets = self.get_cat_top_keywords_pages(soup, offset)
         print('OFFSETS: %s' % (offsets,))

     # Keywords this call is responsible for storing; results of recursive
     # calls go into ``response`` only, since the recursion stored them.
     own = list(response)

     if offsets:
         threadPool = ThreadPool(min(len(offsets), 5))
         last_idx = len(offsets) - 1
         for idx, page_offset in enumerate(offsets):
             page_url = url_tpl % (parent_cid, level3, up_flag, page_offset)
             # Only the last page of the batch is asked for further pages.
             next_page = 'True' if idx == last_idx else 'False'
             threadPool.run(self.fetch, callback=None, url=page_url,
                            config=dict(get_next=next_page, offset=page_offset))
         pages = threadPool.killAllWorkers(None)
         for p in pages:
             if not p:
                 continue
             soup2 = BeautifulSoup(p.content,
                                   convertEntities=BeautifulSoup.HTML_ENTITIES,
                                   markupMassage=hexentityMassage)
             offset2 = int(p.config['offset'])
             parsed = self.parse_cat_top_keywords(soup2, offset2)
             response += parsed
             own += parsed
             print('GOT: %d' % offset2)
             if p.config['get_next'] != 'True':
                 continue
             next_offsets = self.get_cat_top_keywords_pages(soup2, offset2)
             print(next_offsets)
             if not next_offsets:
                 continue
             response += self.cat_top_keywords(session, cat, up, offset2,
                                               next_offsets)

     for k in own:
         # Decode once so the insert and the duplicate lookup use the same value.
         name = k['name'].decode('utf-8')
         new_keyword = models.Keyword(name)
         new_keyword.categories.append(cat)
         session.add(new_keyword)
         try:
             session.commit()
         except IntegrityError:
             session.rollback()
             existing = session.query(models.Keyword).filter(
                 models.Keyword.name == name).first()
             if existing is None:
                 # The conflict was not caused by an existing keyword of
                 # this name, so there is nothing to merge into.
                 raise
             if cat not in existing.categories:
                 existing.categories.append(cat)
                 session.commit()
             print('Duplicate %s' % existing)
     return response
示例#5
0
 def cat_top_keywords(self, cat, level3='', up=True, offset=0, offsets=None):
     '''Get top keywords in a specific category.

     When ``offsets`` is empty or ``offset`` is 0 the listing page at
     ``offset`` is fetched and parsed.  For ``offset == 0`` the offsets of
     the following pages are read from that page.  Every page in ``offsets``
     is then fetched through a ``ThreadPool`` (sized to the number of
     offsets, capped at 5); the last page of the batch is also inspected for
     further offsets and, if there are any, the method recurses for them.

     :param cat: category value, passed through ``str()`` into the URL
     :param level3: level-3 value, passed through ``str()`` into the URL
     :param up: when true the URL carries ``up=true``, otherwise ``up=``
     :param offset: offset of the page handled by this call
     :param offsets: offsets of the pages to fetch in this batch; ``None``
         or empty means none are known yet
     :returns: the keyword items of this page and of every following page
         (empty list when the first fetch yields nothing)
     '''
     # The URL only varies by offset, so build the constant parts once.
     url_tpl = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d'
     cat_arg = str(cat)
     level3_arg = str(level3)
     up_flag = 'true' if up else ''

     response = []
     if not offsets or offset == 0:
         rs = self.fetch(url_tpl % (cat_arg, level3_arg, up_flag, offset))
         if not rs:
             return response
         soup = BeautifulSoup(rs.content)
         response = self.parse_cat_top_keywords(soup, offset)
     if offset == 0:
         # offset == 0 always went through the branch above, so soup is set.
         offsets = self.get_cat_top_keywords_pages(soup, offset)
     if offsets:
         threadPool = ThreadPool(min(len(offsets), 5))
         last_idx = len(offsets) - 1
         for idx, page_offset in enumerate(offsets):
             page_url = url_tpl % (cat_arg, level3_arg, up_flag, page_offset)
             # Only the last page of the batch is asked for further pages.
             next_page = 'True' if idx == last_idx else 'False'
             threadPool.run(self.fetch,
                            callback=None,
                            url=page_url,
                            config=dict(get_next=next_page,
                                        offset=page_offset))
         pages = threadPool.killAllWorkers(None)
         for p in pages:
             if not p:
                 continue
             soup2 = BeautifulSoup(p.content)
             offset2 = int(p.config['offset'])
             response += self.parse_cat_top_keywords(soup2, offset2)
             if p.config['get_next'] != 'True':
                 continue
             next_offsets = self.get_cat_top_keywords_pages(soup2, offset2)
             if not next_offsets:
                 continue
             response += self.cat_top_keywords(cat, level3, up, offset2,
                                               next_offsets)
     return response
示例#6
0
def start_ui(test=False):
    """Create the Qt application and main window, run the updater and enter the event loop.

    The tabs are added in the order log, reorder, skins, config; the DCS
    installation discovery runs between the reorder and skins tabs.  The
    function does not return: it ends with ``sys.exit`` on the event loop's
    result.

    :param test: when true, a task is queued on a one-worker ``ThreadPool``
        that sleeps 10 seconds and then calls ``nice_exit``
    """
    from PyQt5.QtWidgets import QApplication
    import sys
    from src.ui.tab_reorder import TabReorder
    from src.ui.tab_log import TabLog
    from src.ui.tab_config import TabConfig
    from src.ui.tab_skins import TabSkins

    logger.debug('starting QtApp object')
    global_.QT_APP = QApplication([])
    global_.MAIN_UI = MainUi()

    log_helpers = {'write_log': 'write'}
    global_.MAIN_UI.add_tab(TabLog(), helpers=log_helpers)

    reorder_helpers = {
        'tab_reorder_update_view_after_remote_scan':
        'tab_reorder_update_view_after_remote_scan',
    }
    global_.MAIN_UI.add_tab(TabReorder(), helpers=reorder_helpers)

    # NOTE(review): discovery runs before the skins tab is built, presumably
    # because that tab needs the discovered installations - confirm.
    from src.misc import dcs_installs
    dcs_installs.discover_dcs_installations()

    global_.MAIN_UI.add_tab(TabSkins(), helpers={})

    config_helpers = {'update_config_tab': 'update_config_tab'}
    global_.MAIN_UI.add_tab(TabConfig(), helpers=config_helpers)
    global_.MAIN_UI.show()

    def pre_update_hook():
        # Hide the UI and report True only for a frozen (packaged) run.
        if hasattr(sys, 'frozen'):
            I.hide()
            return True
        logger.warning('skipping update on script run')
        return False

    def cancel_update_hook():
        # Bring the UI back when the update is cancelled.
        I.show()

    from utils import Progress
    # noinspection PyTypeChecker
    Progress.register_adapter(I)

    from src.updater import updater

    update_kwargs = {
        'current_version': global_.APP_VERSION,
        'executable_path': 'emft.exe',
        'channel': Config().update_channel,
        'cancel_update_hook': cancel_update_hook,
        'pre_update_hook': pre_update_hook,
    }
    updater.find_and_install_latest_release(**update_kwargs)

    global_.MAIN_UI.update_config_tab()

    if test:
        logger.critical('RUNNING IN TEST MODE')
        import time
        from utils import ThreadPool, nice_exit

        def test_hook():
            time.sleep(10)
            nice_exit()

        # Keep the pool bound to a local so it stays referenced while the
        # event loop runs.
        exit_pool = ThreadPool(1, 'test')
        exit_pool.queue_task(test_hook)

    exit_code = global_.QT_APP.exec()
    sys.exit(exit_code)