Пример #1
0
    def run(self):
        """Run the crawl for the configured website, then notify the window.

        Sina Weibo is the only website handled here; ``self.ids`` is passed
        to ``sina_weibo.main`` either as user ids or as message URLs,
        depending on ``self.ids_type``. Any other website only produces a
        "not implemented" log line.

        All window updates are queued with ``wx.CallAfter``, the same way
        completion is signalled, so this method never calls into the window
        directly.
        NOTE(review): presumably this runs on a worker thread -- confirm.
        """
        msg = None

        # start the works
        if self.website == settings.SINA_WEIBO:
            if self.ids_type == 'uid':
                sina_weibo.main(fetcher=self.fetcher,
                                uids=self.ids,
                                fetch_data=self.fetch_data,
                                store_path=self.store_path,
                                window=self.window)
            elif self.ids_type == 'msg_url':
                sina_weibo.main(fetcher=self.fetcher,
                                msg_urls=self.ids,
                                store_path=self.store_path,
                                window=self.window)
        elif self.website == settings.TWITTER:
            msg = 'For twitter, not implemented in current version.'
        elif self.website == settings.FACEBOOK:
            msg = 'For facebook, not implemented in current version.'
        else:
            msg = 'For %s, not implemented in current version.' % self.website

        if msg is not None:
            # Queued before finished(), so the log line is still handled
            # first.
            wx.CallAfter(self.window.write_logs, msg)

        # finished
        wx.CallAfter(self.window.finished)
 def run(self):
     """Dispatch the crawl for ``self.website`` and signal completion.

     weibo.com targets are crawled by user id or by message URL according
     to ``self.ids_type``; weibo.cn targets are crawled by user id. For
     every other website a "not implemented" notice is queued for the
     window log. ``self.window.finished`` is always queued last.
     """
     site = self.website
     notice = None

     if site == settings.COMWEIBO:
         id_kind = self.ids_type
         if id_kind == 'uid':
             sina_weibo.main(
                 fetcher=self.fetcher,
                 uids=self.ids,
                 fetch_data=self.fetch_data,
                 store_path=self.store_path,
                 window=self.window,
                 weibo_com=True)
         elif id_kind == 'msg_url':
             sina_weibo.main(
                 fetcher=self.fetcher,
                 msg_urls=self.ids,
                 fetch_data=self.fetch_data,
                 store_path=self.store_path,
                 window=self.window,
                 weibo_com=True)
     elif site == settings.CNWEIBO:
         # This call passes no store_path, unlike the weibo.com calls.
         sina_weibo.main(
             fetcher=self.fetcher,
             uids=self.ids,
             fetch_data=self.fetch_data,
             window=self.window,
             weibo_com=False)
     elif site == settings.TWITTER:
         notice = 'For twitter, not implemented in current version.'
     elif site == settings.FACEBOOK:
         notice = 'For facebook, not implemented in current version.'
     else:
         notice = 'For %s, not implemented in current version.' % site

     if notice is not None:
         wx.CallAfter(self.window.write_logs, str(notice))

     # finished
     wx.CallAfter(self.window.finished)
Пример #3
0
from thread_pool import WorkerManager

# Log in once; every crawl below reuses this fetcher.
fetcher = ComWeiboFetcher(username=account.user, password=account.pwd)
login_ok = fetcher.check_cookie()
if not login_ok:
    print('login failed.')
    sys.exit()

# Handed to sw.main as uids_storage; presumably filled in place by the crawl.
fans = []
follows = []

# Follows are crawled first, then fans, for the same monitored users.
for relation, bucket in (('follows', follows), ('fans', fans)):
    sw.main(fetcher,
            fetch_data=relation,
            store_path='./file/',
            uids=memstorage.users_id_moniterd,
            uids_storage=bucket)

# De-duplicated union of both relations.
friends_list = list(set(fans) | set(follows))

print(friends_list)
#host's weibo
sw.main(fetcher,
        fetch_data='weibos',
        store_path='./file/',
Пример #4
0
import memstorage
import account
from thread_pool import WorkerManager

# Log in once; every crawl below reuses this fetcher.
fetcher = ComWeiboFetcher(username=account.user, password=account.pwd)

login_ok = fetcher.check_cookie()

if not login_ok:
    print('login failed.')
    sys.exit()

# Handed to sw.main as uids_storage; presumably filled in place by the crawl.
fans = []
follows = []

sw.main(fetcher, fetch_data='follows', store_path='./file/',
        uids=memstorage.users_id_moniterd, uids_storage=follows)
sw.main(fetcher, fetch_data='fans', store_path='./file/',
        uids=memstorage.users_id_moniterd, uids_storage=fans)

# De-duplicated union of both relations.
friends_list = list(set(fans) | set(follows))

print(friends_list)
# host's weibo
sw.main(fetcher, fetch_data='weibos', store_path='./file/',
        uids=memstorage.users_id_moniterd)
# friends' weibo
n_threads = 10
n_paritions = 10
# Clamp to 1: with fewer than n_paritions friends the integer division gives
# 0, and range() rejects a zero step in the partition loop below.
len_partition = max(1, len(friends_list) // n_paritions)

worker_manager = WorkerManager(n_threads)
for i in range(0,len(friends_list),len_partition):
	worker_manager.add_job(sw.main, fetcher, fetch_data='weibos',store_path='./file/',
Пример #5
0
    def do_task(self):
        '''Run the task described by ``task.dat`` and archive its CSV output.

        task file format:
            task_id:**(time format) time.strftime('%Y-%m-%d-%H-%M', time.localtime())
            id_type:**(uid/msg_url)
            fetch_data: weibos/follows/fans/infos
            uids:(separated by semicolon)
            msg_urls:(separated by semicolon)

        The task file is deleted once it has been processed. Result CSVs are
        added to the archive and then moved to ``self.upload_path``.

        Returns the (already closed) ``tarfile.TarFile`` that was written, or
        ``None`` when there is no task file or no matching CSV was produced.
        '''
        task_id = ''
        id_type = 'uid'
        fetch_data = 'infos'
        uids = []
        msg_urls = []
        # Stays empty when the task names no usable ids, so the archive step
        # is skipped instead of failing on an unbound name.
        files = []

        tar_file = None

        f_task = os.path.join(TASK_PATH, 'task.dat')
        if not os.path.exists(f_task):
            return tar_file

        fp = codecs.open(f_task, 'r', 'utf-8')
        try:
            data = fp.readlines()
        finally:
            fp.close()

        p = re.compile(r'^http[s]?://weibo.com/\d*/[A-Za-z0-9]+$', re.U)

        # parse
        for line in data:
            line = line.strip()

            if line.startswith('task_id:'):
                task_id = line.split('task_id:')[-1]
            elif line.startswith('id_type:'):
                id_type = line.split('id_type:')[-1]
            elif line.startswith('fetch_data:'):
                fetch_data = line.split('fetch_data:')[-1].lower()
            elif line.startswith('uids:'):
                _uids = line.split('uids:')[-1].split(';')
                _uids = [
                    uid.strip().encode('utf-8') for uid in _uids
                    if len(uid) > 0
                ]

                # Keep only numeric ids. Iterate the parsed candidates, not
                # the (initially empty) result list.
                for uid in _uids:
                    try:
                        int(uid)
                    except ValueError:
                        continue
                    uids.append(uid)
            elif line.startswith('msg_urls:'):
                _msg_urls = line.split('msg_urls:')[-1].split(';')

                for msg_url in _msg_urls:
                    # The pattern is anchored, so drop padding around ';'.
                    found = p.search(msg_url.strip())
                    if found is not None:
                        msg_urls.append(found.group(0))
            else:
                msg = 'Task format error.'
                logger.info(msg)
                wx.CallAfter(self.window.write_logs, msg)

        # start
        if id_type == 'uid' and len(uids) > 0:
            sina_weibo.main(fetcher=self.fetcher,
                            fetch_data=fetch_data,
                            uids=uids,
                            store_path=self.store_path,
                            window=self.window)

            files = [
                f for f in os.listdir(self.store_path)
                if fetch_data in f and f.endswith('.csv')
            ]
        elif id_type == 'msg_url' and len(msg_urls) > 0:
            sina_weibo.main(fetcher=self.fetcher,
                            msg_urls=msg_urls,
                            store_path=self.store_path,
                            window=self.window)

            # Parenthesised: 'and' binds tighter than 'or', so without the
            # parentheses any "reposts" file passed regardless of extension.
            files = [
                f for f in os.listdir(self.store_path)
                if ('reposts' in f or 'comments' in f) and f.endswith('.csv')
            ]

        # compress and upload
        if files:
            tar_f = str(self.host_fetcher.username) + str(task_id) + '.tar.gz'
            tar_f = os.path.join(self.store_path, tar_f)

            # NOTE(review): the archive is bzip2-compressed although its name
            # ends in '.tar.gz' -- confirm what the consumer expects before
            # changing either.
            tar_file = tarfile.open(tar_f, 'w:bz2')
            try:
                tar_file.add(f_task, arcname='task.dat')

                for f in files:
                    f_name = os.path.join(self.store_path, f)
                    tar_file.add(f_name, arcname=f)

                    os.rename(f_name, os.path.join(self.upload_path, f))
            finally:
                tar_file.close()

        # f_task is already the full path that was read and archived above.
        os.remove(f_task)

        return tar_file
Пример #6
0
# encoding: utf-8


from sina_weibo.fetcher import ComWeiboFetcher
import sina_weibo
import sys
import time
import memstorage

# NOTE(review): credentials are hard-coded in source; consider loading them
# from configuration instead.
user = '******'
pwd = 'ecnupass'

# Log in, and stop right away when the login check fails.
fetcher = ComWeiboFetcher(username=user, password=pwd)
login_ok = fetcher.check_cookie()
if not login_ok:
    print('login failed.')
    sys.exit()

# Crawl the weibos of every monitored user into ./file/.
sina_weibo.main(
    fetcher,
    fetch_data='weibos',
    store_path='./file/',
    uids=memstorage.users_id_moniterd
)

 def do_task(self):
     '''Run the task described by ``task.dat`` and archive its CSV output.

     task file format:
         task_id:**(time format) time.strftime('%Y-%m-%d-%H-%M', time.localtime())
         id_type:**(uid/msg_url)
         fetch_data: weibos/follows/fans/infos
         uids:(separated by semicolon)
         msg_urls:(separated by semicolon)

     The task file is deleted once it has been processed. Result CSVs are
     added to the archive and then moved to ``self.upload_path``.

     Returns the (already closed) ``tarfile.TarFile`` that was written, or
     ``None`` when there is no task file or no matching CSV was produced.
     '''
     task_id = ''
     id_type = 'uid'
     fetch_data = 'infos'
     uids = []
     msg_urls = []
     # Stays empty when the task names no usable ids, so the archive step
     # is skipped instead of failing on an unbound name.
     files = []

     tar_file = None

     f_task = os.path.join(TASK_PATH, 'task.dat')
     if not os.path.exists(f_task):
         return tar_file

     fp = codecs.open(f_task, 'r', 'utf-8')
     try:
         data = fp.readlines()
     finally:
         fp.close()

     p = re.compile(r'^http[s]?://weibo.com/\d*/[A-Za-z0-9]+$', re.U)

     # parse
     for line in data:
         line = line.strip()

         if line.startswith('task_id:'):
             task_id = line.split('task_id:')[-1]
         elif line.startswith('id_type:'):
             id_type = line.split('id_type:')[-1]
         elif line.startswith('fetch_data:'):
             fetch_data = line.split('fetch_data:')[-1].lower()
         elif line.startswith('uids:'):
             _uids = line.split('uids:')[-1].split(';')
             _uids = [uid.strip().encode('utf-8') for uid in _uids if len(uid) > 0]

             # Keep only numeric ids. Iterate the parsed candidates, not
             # the (initially empty) result list.
             for uid in _uids:
                 try:
                     int(uid)
                 except ValueError:
                     continue
                 uids.append(uid)
         elif line.startswith('msg_urls:'):
             _msg_urls = line.split('msg_urls:')[-1].split(';')

             for msg_url in _msg_urls:
                 # The pattern is anchored, so drop padding around ';'.
                 found = p.search(msg_url.strip())
                 if found is not None:
                     msg_urls.append(found.group(0))
         else:
             msg = 'Task format error.'
             logger.info(msg)
             wx.CallAfter(self.window.write_logs, msg)

     # start
     if id_type == 'uid' and len(uids) > 0:
         sina_weibo.main(fetcher=self.fetcher, fetch_data=fetch_data,
                         uids=uids, store_path=self.store_path,
                         window=self.window)

         files = [f for f in os.listdir(self.store_path)
                  if fetch_data in f and f.endswith('.csv')]
     elif id_type == 'msg_url' and len(msg_urls) > 0:
         sina_weibo.main(fetcher=self.fetcher, msg_urls=msg_urls,
                         store_path=self.store_path, window=self.window)

         # Parenthesised: 'and' binds tighter than 'or', so without the
         # parentheses any "reposts" file passed regardless of extension.
         files = [f for f in os.listdir(self.store_path)
                  if ('reposts' in f or 'comments' in f) and f.endswith('.csv')]

     # compress and upload
     if files:
         tar_f = str(self.host_fetcher.username) + str(task_id) + '.tar.gz'
         tar_f = os.path.join(self.store_path, tar_f)

         # NOTE(review): the archive is bzip2-compressed although its name
         # ends in '.tar.gz' -- confirm what the consumer expects before
         # changing either.
         tar_file = tarfile.open(tar_f, 'w:bz2')
         try:
             tar_file.add(f_task, arcname='task.dat')

             for f in files:
                 f_name = os.path.join(self.store_path, f)
                 tar_file.add(f_name, arcname=f)

                 os.rename(f_name, os.path.join(self.upload_path, f))
         finally:
             tar_file.close()

     # f_task is already the full path that was read and archived above.
     os.remove(f_task)

     return tar_file
Пример #8
0
# encoding: utf-8

from sina_weibo.fetcher import ComWeiboFetcher
import sina_weibo
import sys
import time
import memstorage

# NOTE(review): credentials are hard-coded in source; consider loading them
# from configuration instead.
user = '******'
pwd = 'ecnupass'

# Log in, and stop right away when the login check fails.
fetcher = ComWeiboFetcher(username=user, password=pwd)
login_ok = fetcher.check_cookie()
if not login_ok:
    print('login failed.')
    sys.exit()

# Crawl the weibos of every monitored user; memstorage.weibos_url_moniterd is
# handed over as the weibos_storage argument.
crawl_options = dict(
    fetch_data='weibos',
    store_path='./file/',
    uids=memstorage.users_id_moniterd,
    weibos_storage=memstorage.weibos_url_moniterd,
)
sina_weibo.main(fetcher, **crawl_options)
Пример #9
0
def TestWeibo__init__(user, pwd, weibo_com):
    """Log in and run every crawl type once against fixed sample targets.

    ``weibo_com`` selects ``ComWeiboFetcher`` (truthy) or ``CnWeiboFetcher``
    (falsy) and is forwarded to each ``sina_weibo.main`` call. The process
    exits when the login check fails. A summary with the fetcher's
    connection count and the elapsed whole seconds is printed at the end.
    """
    fetcher_cls = ComWeiboFetcher if weibo_com else CnWeiboFetcher
    fetcher = fetcher_cls(username=user, password=pwd)

    if not fetcher.check_cookie():
        print("login failed.")
        sys.exit()

    uids = [1000000253, 10057, 10029]
    msg_urls = ["http://weibo.com/1000000253/ezC36cq3i6G", "http://weibo.com/1713926427/A2V5CENGU"]

    # (banner, fetch_data value, target keyword arguments), in run order.
    jobs = (
        ("crawl weibos", "weibos", {"uids": uids}),
        ("crawl follows", "follows", {"uids": uids}),
        ("crawl fans", "fans", {"uids": uids}),
        ("crawl infos", "infos", {"uids": uids}),
        ("crawl reposts", "repost", {"msg_urls": msg_urls}),
        ("crawl comments", "comment", {"msg_urls": msg_urls}),
    )

    start = time.time()

    for banner, kind, targets in jobs:
        print(banner)
        sina_weibo.main(fetcher, fetch_data=kind, store_path="./file/", weibo_com=weibo_com, **targets)

    cost_time = int(time.time() - start)
    print("finished: # connections: %s, cost time: %s" % (fetcher.n_connections, cost_time))
Пример #10
0
def TestWeibo__init__(user, pwd, weibo_com):
    '''Log in and run every crawl type once against fixed sample targets.

    ``weibo_com`` selects ``ComWeiboFetcher`` (truthy) or ``CnWeiboFetcher``
    (falsy) and is forwarded to each ``sina_weibo.main`` call. The process
    exits when the login check fails. A summary with the fetcher's
    connection count and the elapsed whole seconds is printed at the end.
    '''
    if weibo_com:
        fetcher = ComWeiboFetcher(username=user, password=pwd)
    else:
        fetcher = CnWeiboFetcher(username=user, password=pwd)

    if not fetcher.check_cookie():
        print('login failed.')
        sys.exit()

    uids = [1000000253, 10057, 10029]
    msg_urls = [
        'http://weibo.com/1000000253/ezC36cq3i6G',
        'http://weibo.com/1713926427/A2V5CENGU',
    ]

    def crawl(banner, kind, **targets):
        # One crawl step: announce it, then run it with the shared settings.
        print(banner)
        sina_weibo.main(fetcher,
                        fetch_data=kind,
                        store_path='./file/',
                        weibo_com=weibo_com,
                        **targets)

    start = time.time()

    # Per-user crawls.
    crawl('crawl weibos', 'weibos', uids=uids)
    crawl('crawl follows', 'follows', uids=uids)
    crawl('crawl fans', 'fans', uids=uids)
    crawl('crawl infos', 'infos', uids=uids)

    # Per-message crawls.
    crawl('crawl reposts', 'repost', msg_urls=msg_urls)
    crawl('crawl comments', 'comment', msg_urls=msg_urls)

    cost_time = int(time.time() - start)
    summary = 'finished: # connections: %s, cost time: %s' % (
        fetcher.n_connections, cost_time)
    print(summary)
Пример #11
0
import sys
import time
import memstorage


# Log in, and stop right away when the login check fails.
fetcher = ComWeiboFetcher(username=memstorage.user, password=memstorage.pwd)
login_ok = fetcher.check_cookie()
if not login_ok:
    print('login failed.')
    sys.exit()

start = time.time()

# Step 1: follows of the monitored users; memstorage.uids_url_moniterd is
# handed over as uids_storage.
sina_weibo.main(
    fetcher,
    fetch_data='follows',
    store_path='./file/',
    uids=memstorage.users_id_moniterd,
    uids_storage=memstorage.uids_url_moniterd
)

# Disabled variant: also crawl fans and keep only the ids present in both.
#sina_weibo.main(fetcher, fetch_data='fans', store_path='./file/', uids=memstorage.users_id_moniterd, uids_storage=memstorage.uids_url2_moniterd)
#a = set(memstorage.uids_url_moniterd)
#b = set(memstorage.uids_url2_moniterd)
#print a & b
#c = list(a&b)

# Step 2: weibos of the ids in memstorage.uids_url_moniterd;
# memstorage.weibos_url_moniterd is handed over as weibos_storage.
sina_weibo.main(
    fetcher,
    fetch_data='weibos',
    store_path='./file/',
    uids=memstorage.uids_url_moniterd,
    weibos_storage=memstorage.weibos_url_moniterd
)

print('crawl reposts and comments')

# Step 3: crawl by message URL, using memstorage.weibos_url_moniterd.
sina_weibo.main(
    fetcher,
    store_path='./file/',
    msg_urls=memstorage.weibos_url_moniterd
)
    def do_task(self):
        """Run the task described by ``task.dat`` and archive its CSV output.

        task file format:
            task_id:**(time format) time.strftime('%Y-%m-%d-%H-%M', time.localtime())
            id_type:**(uid/msg_url)
            fetch_data: weibos/follows/fans/infos
            uids:(separated by semicolon)
            msg_urls:(separated by semicolon)

        The task file is deleted once it has been processed. Result CSVs are
        added to the archive and then moved to ``self.upload_path``.

        Returns the (already closed) ``tarfile.TarFile`` that was written, or
        ``None`` when there is no task file or no matching CSV was produced.
        """
        task_id = ""
        id_type = "uid"
        fetch_data = "infos"
        uids = []
        msg_urls = []
        # Stays empty when the task names no usable ids, so the archive step
        # is skipped instead of failing on an unbound name.
        files = []

        tar_file = None

        f_task = os.path.join(TASK_PATH, "task.dat")
        if not os.path.exists(f_task):
            return tar_file

        fp = codecs.open(f_task, "r", "utf-8")
        try:
            data = fp.readlines()
        finally:
            fp.close()

        p = re.compile(r"^http[s]?://weibo.com/\d*/[A-Za-z0-9]+$", re.U)

        # parse
        for line in data:
            line = line.strip()

            if line.startswith("task_id:"):
                task_id = line.split("task_id:")[-1]
            elif line.startswith("id_type:"):
                id_type = line.split("id_type:")[-1]
            elif line.startswith("fetch_data:"):
                fetch_data = line.split("fetch_data:")[-1].lower()
            elif line.startswith("uids:"):
                _uids = line.split("uids:")[-1].split(";")
                _uids = [uid.strip().encode("utf-8") for uid in _uids if len(uid) > 0]

                # Keep only numeric ids. Iterate the parsed candidates, not
                # the (initially empty) result list.
                for uid in _uids:
                    try:
                        int(uid)
                    except ValueError:
                        continue
                    uids.append(uid)
            elif line.startswith("msg_urls:"):
                _msg_urls = line.split("msg_urls:")[-1].split(";")

                for msg_url in _msg_urls:
                    # The pattern is anchored, so drop padding around ';'.
                    found = p.search(msg_url.strip())
                    if found is not None:
                        msg_urls.append(found.group(0))
            else:
                msg = "Task format error."
                logger.info(msg)
                wx.CallAfter(self.window.write_logs, msg)

        # start
        if id_type == "uid" and len(uids) > 0:
            sina_weibo.main(
                fetcher=self.fetcher,
                fetch_data=fetch_data,
                uids=uids,
                store_path=self.store_path,
                window=self.window,
            )

            files = [f for f in os.listdir(self.store_path) if fetch_data in f and f.endswith(".csv")]
        elif id_type == "msg_url" and len(msg_urls) > 0:
            sina_weibo.main(fetcher=self.fetcher, msg_urls=msg_urls, store_path=self.store_path, window=self.window)

            # Parenthesised: 'and' binds tighter than 'or', so without the
            # parentheses any "reposts" file passed regardless of extension.
            files = [
                f
                for f in os.listdir(self.store_path)
                if ("reposts" in f or "comments" in f) and f.endswith(".csv")
            ]

        # compress and upload
        if files:
            tar_f = str(self.host_fetcher.username) + str(task_id) + ".tar.gz"
            tar_f = os.path.join(self.store_path, tar_f)

            # NOTE(review): the archive is bzip2-compressed although its name
            # ends in '.tar.gz' -- confirm what the consumer expects before
            # changing either.
            tar_file = tarfile.open(tar_f, "w:bz2")
            try:
                tar_file.add(f_task, arcname="task.dat")

                for f in files:
                    f_name = os.path.join(self.store_path, f)
                    tar_file.add(f_name, arcname=f)

                    os.rename(f_name, os.path.join(self.upload_path, f))
            finally:
                tar_file.close()

        # f_task is already the full path that was read and archived above.
        os.remove(f_task)

        return tar_file