示例#1
0
def fb_scrape(url):
    """
        Scrape a facebook url, get likes, talking_about_count and checkins

        @type url: string
        @param url: facebook url

        @rtype:  dict
        @return: likes, talking_about_count and checkins dict data
    """
    data = {'likes': 0, 'talking_about_count': 0, 'checkins': 0}
    facebook_data = None
    if check_url(url, 'www.facebook.com'):
        facebook_id = get_facebook_id(url)
        try:
            facebook_data = facebook_api.request(facebook_id)
        except Exception as e:
            # Best effort: log the API failure and fall back to zeros.
            log.error(e)
    if facebook_data:
        # `or 0` reproduces the original falsy-to-zero normalization:
        # a missing key, None or 0 all map to 0.
        data['likes'] = facebook_data.get('likes') or 0
        data['talking_about_count'] = facebook_data.get('talking_about_count') or 0
        data['checkins'] = facebook_data.get('were_here_count') or 0
    return data
示例#2
0
def fb_scrape(url):
    """
        Scrape a facebook url, get likes, talking_about_count and checkins

        @type url: string
        @param url: facebook url

        @rtype:  dict
        @return: likes, talking_about_count and checkins dict data
    """
    data = {'likes': 0, 'talking_about_count': 0, 'checkins': 0}
    facebook_data = None
    if check_url(url, 'www.facebook.com'):
        facebook_id = get_facebook_id(url)
        try:
            facebook_data = facebook_api.request(facebook_id)
        except Exception as e:
            # Best effort: log the API failure and fall back to zeros.
            log.error(e)
    if facebook_data:
        # `or 0` reproduces the original falsy-to-zero normalization:
        # a missing key, None or 0 all map to 0.
        data['likes'] = facebook_data.get('likes') or 0
        data['talking_about_count'] = facebook_data.get('talking_about_count') or 0
        data['checkins'] = facebook_data.get('were_here_count') or 0
    return data
示例#3
0
def fblogin():
    """Log in to Facebook and install the authenticated urllib2 opener.

    Uses the module-level fb_username / fb_password credentials. Failures
    are logged and swallowed (best-effort login).

    @rtype: None
    @return: None
    """
    try:
        # Local renamed from `fblogin` so it no longer shadows this function.
        login = FacebookLogin(fb_username, fb_password)
        urllib2.install_opener(login.opener)
    except Exception as e:
        log.error(e)
示例#4
0
def scrap_facebook_raw_data(url):
    """Scrape likes / talking_about_count / checkins straight from a
    Facebook page's HTML (fallback when the Graph API is unavailable).

    @type url: string
    @param url: facebook page url

    @rtype: dict
    @return: dict with 'likes', 'talking_about_count' and 'checkins'
        (zeros when the page cannot be fetched or parsed)
    """
    data = {'likes': 0, 'talking_about_count': 0, 'checkins': 0}
    if check_url(url, 'www.facebook.com'):
        # Raw strings so backslash escapes (e.g. \w) stay literal regex tokens.
        number_pat = r"[0-9]+"
        stat_pat = (r'<div class="fsm fwn fcg"><div class="fsm fwn fcg">'
                    r'([0-9]+)(.*)([0-9]+)(.*)([0-9]+)(.*)\w+</div></div>')
        try:
            with closing(urllib2.urlopen(url=url, timeout=30)) as page:
                content = page.read()
                # Strip thousands separators so the figures parse as ints.
                content = re.sub(',', '', content)
                match = re.search(stat_pat, content)
                if match:
                    numbers = re.findall(number_pat, match.group())
                    if len(numbers) >= 3:
                        data['likes'] = int(numbers[0])
                        data['talking_about_count'] = int(numbers[1])
                        data['checkins'] = int(numbers[2])
                    elif len(numbers) >= 2:
                        # Some pages expose no check-in counter.
                        data['likes'] = int(numbers[0])
                        data['talking_about_count'] = int(numbers[1])
        except Exception as e:
            log.error('Facebook url %s scrape error!' % url)
            log.error(e)
    return data
示例#5
0
def cal_fb_hm(fb_likes, fb_talking_about_count, fb_checkins):
    """Derive the Facebook health metrics from raw page counters.

    @type fb_likes: int
    @param fb_likes: page like count
    @type fb_talking_about_count: int
    @param fb_talking_about_count: 'talking about' count
    @type fb_checkins: int
    @param fb_checkins: check-in count
    @rtype: dict
    @return: raw inputs plus derived metrics; derived values stay 0 when
        all inputs are 0 or the computation fails (e.g. fb_likes == 0).
    """
    metrics = {
        'fb_likes': fb_likes,
        'fb_talking_about_count': fb_talking_about_count,
        'fb_checkins': fb_checkins,
        'fb_tl': 0,
        'fb_chl': 0,
        'fb_combined': 0,
        'fb_likes_sqrt': 0,
        'fb_tchk_sqrt': 0,
        'fb_health': 0,
    }
    # All-zero input: nothing to derive, return the zeroed metrics as-is.
    if fb_likes == 0 and fb_talking_about_count == 0 and fb_checkins == 0:
        return metrics
    try:
        likes = float(fb_likes)
        talking = float(fb_talking_about_count)
        checkins = float(fb_checkins)
        tl = talking / likes * 1500
        chl = checkins / likes * 100
        combined = tl + chl
        likes_sqrt = likes ** 0.9
        tchk_sqrt = (talking + checkins) ** 0.2
        health = (((combined ** 0.5) * likes_sqrt * tchk_sqrt) /
                  30000000) ** 0.65
        metrics.update(
            fb_tl=tl,
            fb_chl=chl,
            fb_combined=combined,
            fb_likes_sqrt=likes_sqrt,
            fb_tchk_sqrt=tchk_sqrt,
            fb_health=health,
        )
    except Exception as e:
        # e.g. ZeroDivisionError when fb_likes == 0: log and keep zeros.
        log.error(e)
    return metrics
示例#6
0
def fblogin():
    """Log in to Facebook and install the authenticated urllib2 opener.

    Uses the module-level fb_username / fb_password credentials. Failures
    are logged and swallowed (best-effort login).

    @rtype: None
    @return: None
    """
    try:
        # Local renamed from `fblogin` so it no longer shadows this function.
        login = FacebookLogin(fb_username, fb_password)
        urllib2.install_opener(login.opener)
    except Exception as e:
        log.error(e)
示例#7
0
def scrap_facebook_raw_data(url):
    """Scrape likes / talking_about_count / checkins straight from a
    Facebook page's HTML (fallback when the Graph API is unavailable).

    @type url: string
    @param url: facebook page url

    @rtype: dict
    @return: dict with 'likes', 'talking_about_count' and 'checkins'
        (zeros when the page cannot be fetched or parsed)
    """
    data = {'likes': 0, 'talking_about_count': 0, 'checkins': 0}
    if check_url(url, 'www.facebook.com'):
        # Raw strings so backslash escapes (e.g. \w) stay literal regex tokens.
        number_pat = r"[0-9]+"
        stat_pat = (r'<div class="fsm fwn fcg"><div class="fsm fwn fcg">'
                    r'([0-9]+)(.*)([0-9]+)(.*)([0-9]+)(.*)\w+</div></div>')
        try:
            with closing(urllib2.urlopen(url=url, timeout=30)) as page:
                content = page.read()
                # Strip thousands separators so the figures parse as ints.
                content = re.sub(',', '', content)
                match = re.search(stat_pat, content)
                if match:
                    numbers = re.findall(number_pat, match.group())
                    if len(numbers) >= 3:
                        data['likes'] = int(numbers[0])
                        data['talking_about_count'] = int(numbers[1])
                        data['checkins'] = int(numbers[2])
                    elif len(numbers) >= 2:
                        # Some pages expose no check-in counter.
                        data['likes'] = int(numbers[0])
                        data['talking_about_count'] = int(numbers[1])
        except Exception as e:
            log.error('Facebook url %s scrape error!' % url)
            log.error(e)
    return data
示例#8
0
def yt_scrape(url):
    """
        Scrape a youtube url for view_count and subscriber_count.

        @type url: string
        @param url: youtube url

        @rtype:  dict
        @return: view_count, subscriber_count dict data
    """
    data = {'view_count': 0, 'subscriber_count': 0}
    entry = None
    if check_url(url, 'www.youtube.com'):
        try:
            entry = youtube_api.GetYouTubeUserEntry(
                username=get_youtube_id(url))
        except Exception as e:
            # Best effort: log the failure and return the zeroed dict.
            log.error('Youtube %s scrape error' % url)
            log.error(e)
    if entry:
        stats = entry.statistics
        data['view_count'] = int(stats.view_count)
        data['subscriber_count'] = int(stats.subscriber_count)
    return data
示例#9
0
文件: cron.py 项目: zhiwehu/scraper
def reSchedule(seconds=86400):
    """Re-schedule the recurring job with a new interval.

    @type seconds: int
    @param seconds: the new interval in seconds (default: one day)

    @rtype: None
    @return: None
    """
    log.debug('job reschedule seconds %d' % seconds)
    # Best-effort removal: unschedule_func raises when doJob was never
    # scheduled, which is fine on first run — log and carry on.
    try:
        sched.unschedule_func(doJob)
    except Exception as e:
        log.error(e)
    sched.add_interval_job(doJob, seconds=seconds)
示例#10
0
def reSchedule(seconds=86400):
    """Re-schedule the recurring job with a new interval.

    @type seconds: int
    @param seconds: the new interval in seconds (default: one day)

    @rtype: None
    @return: None
    """
    log.debug('job reschedule seconds %d' % seconds)
    # Best-effort removal: unschedule_func raises when doJob was never
    # scheduled, which is fine on first run — log and carry on.
    try:
        sched.unschedule_func(doJob)
    except Exception as e:
        log.error(e)
    sched.add_interval_job(doJob, seconds=seconds)
示例#11
0
def tw_scrape(url):
    """
        Scrape a twitter url for followers_count and tweets.

        @type url: string
        @param url: twitter url

        @rtype:  dict
        @return: followers_count, tweets dict data (plus the twitter_id)
    """
    data = {'twitter_id': '', 'followers_count': 0, 'tweets': 0}
    user = None
    if check_url(url, 'twitter.com'):
        handle = get_twitter_id(url)
        data['twitter_id'] = handle
        try:
            user = twitter_api.GetUser(handle)
        except Exception as e:
            # Best effort: log and return zero counts with the id set.
            log.error(e)
    if user:
        data['followers_count'] = user.followers_count
        data['tweets'] = user.statuses_count
    return data
示例#12
0
def yt_scrape(url):
    """
        Scrape a youtube url for view_count and subscriber_count.

        @type url: string
        @param url: youtube url

        @rtype:  dict
        @return: view_count, subscriber_count dict data
    """
    data = {'view_count': 0, 'subscriber_count': 0}
    entry = None
    if check_url(url, 'www.youtube.com'):
        try:
            entry = youtube_api.GetYouTubeUserEntry(
                username=get_youtube_id(url))
        except Exception as e:
            # Best effort: log the failure and return the zeroed dict.
            log.error('Youtube %s scrape error' % url)
            log.error(e)
    if entry:
        stats = entry.statistics
        data['view_count'] = int(stats.view_count)
        data['subscriber_count'] = int(stats.subscriber_count)
    return data
示例#13
0
def tw_scrape(url):
    """
        Scrape a twitter url for followers_count and tweets.

        @type url: string
        @param url: twitter url

        @rtype:  dict
        @return: followers_count, tweets dict data (plus the twitter_id)
    """
    data = {'twitter_id': '', 'followers_count': 0, 'tweets': 0}
    user = None
    if check_url(url, 'twitter.com'):
        handle = get_twitter_id(url)
        data['twitter_id'] = handle
        try:
            user = twitter_api.GetUser(handle)
        except Exception as e:
            # Best effort: log and return zero counts with the id set.
            log.error(e)
    if user:
        data['followers_count'] = user.followers_count
        data['tweets'] = user.statuses_count
    return data
示例#14
0
def get_tw_data(twitter_id):
    """Fetch Twitalyzer metrics for a twitter user.

    @type twitter_id: string
    @param twitter_id: the twitter user name

    @rtype: dict
    @return: impact / engagement / influence / retweeted / klout_truereach
        values (zeros when the API call fails or a key is absent)
    """
    data = {'impact': 0, 'engagement': 0, 'influence': 0, 'retweeted': 0, 'klout_truereach': 0}
    url = 'http://www.twitalyzer.com/api/2/user.asp?k=%s&u=%s&f=JSON' % (api_key, twitter_id)
    try:
        tw_api_data = urllib2.urlopen(url).read()
        if 'error' in tw_api_data:
            log.error('TWITTER NAME: %s:%s' % (twitter_id, tw_api_data))
        # Crude JSON parsing: strip the '[{' / '}]' wrapper, split on ','.
        tw_api_data = tw_api_data.replace('[{', '').replace('}]', '')
        for item in tw_api_data.split(','):
            # partition() splits on the first ':' only, so an item without a
            # colon no longer raises IndexError and aborts the whole parse.
            key, _, value = item.partition(':')
            key = key.strip()
            # `key in data` replaces dict.has_key(), removed in Python 3.
            if key in data:
                data[key] = value
    except Exception as e:
        log.error('Get twitter data error for %s' % twitter_id)
        log.error(e)
    return data
示例#15
0
文件: main.py 项目: zhiwehu/scraper
    def write_db(self, company_list, db_filename):
        """
            write CompanySocialMedia object list into sqlite3 database

            @type company_list: list
            @param company_list: the CompanySocialMedia object list

            @type db_filename: string
            @param db_filename: the sqlite database file name

            @rtype: int
            @return: insert total count
        """
        conn = sqlite3.connect(db_filename)
        count = 0
        try:
            c = conn.cursor()
            # Create table (no-op if it already exists)
            c.execute('''CREATE TABLE IF NOT EXISTS COMPANY
                     (
                     COMPANY_NAME TEXT,
                     FB_LIKES INTEGER,
                     FB_TALKING_ABOUT_COUNT INTEGER,
                     FB_CHECKINS INTEGER,
                     FB_TL REAL,
                     FB_CHL REAL,
                     FB_COMBINED REAL,
                     FB_LIKES_SQRT REAL,
                     FB_TCHK_SQRT REAL,
                     FB_HEALTH REAL,
                     TW_FOLLOWERS_COUNT INTEGER,
                     TW_TWEETS INTEGER,
                     TW_IMPACT REAL,
                     TW_ENGAGEMENT REAL,
                     TW_INFLUENCE REAL,
                     TW_RETWEETED REAL,
                     TW_KLOUT_TRUEREACH REAL,
                     TW_HEALTH REAL,
                     YT_SUBSCRIBER_COUNT INTEGER,
                     YT_VIEW_COUNT INTEGER,
                     YT_HEALTH REAL,
                     TSSH_RAW REAL,
                     TSSH_PWR_REDUCED REAL,
                     FB_PERCENT REAL,
                     TW_PERCENT REAL,
                     YT_PERCENT REAL,
                     FB_ABS REAL,
                     TW_ABS REAL,
                     YT_ABS REAL,
                     TIME_TAKEN TIMESTAMP
                     )''')
            for company in company_list:
                # Insert one row per company; a failing row is logged and
                # skipped so the rest of the batch still gets written.
                try:
                    c.execute("INSERT INTO COMPANY VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                        (company.company_name,
                         company.fb_likes,
                         company.fb_talking_about_count,
                         company.fb_checkins,
                         company.fb_metrics['fb_tl'],
                         company.fb_metrics['fb_chl'],
                         company.fb_metrics['fb_combined'],
                         company.fb_metrics['fb_likes_sqrt'],
                         company.fb_metrics['fb_tchk_sqrt'],
                         company.fb_metrics['fb_health'],
                         company.tw_followers_count,
                         company.tw_tweets,
                         company.tw_metrics['impact'],
                         company.tw_metrics['engagement'],
                         company.tw_metrics['influence'],
                         company.tw_metrics['retweeted'],
                         company.tw_metrics['klout_truereach'],
                         company.tw_metrics['tw_health'],
                         company.yt_subscriber_count,
                         company.yt_view_count,
                         company.yt_metrics['yt_health'],
                         company.micro_metrics['tssh_raw'],
                         company.micro_metrics['tssh_pwr_reduced'],
                         company.micro_metrics['fb_percent'],
                         company.micro_metrics['tw_percent'],
                         company.micro_metrics['yt_percent'],
                         company.micro_metrics['fb_abs'],
                         company.micro_metrics['tw_abs'],
                         company.micro_metrics['yt_abs'],
                         company.time_taken
                            ))
                    count += 1
                except Exception as e:
                    log.error(e)
            conn.commit()
            c.close()
        finally:
            # Always release the connection, even if table creation or
            # commit raises (the original leaked it on any exception).
            conn.close()
        return count
示例#16
0
# Module-level scraper setup: build one shared API client per service.
from contextlib import closing
import urllib2
from logUtil import log

import twitter
# Unauthenticated Twitter API client shared by the scrape functions.
twitter_api = twitter.Api()

import gdata.youtube.service
# YouTube GData service client.
youtube_api = gdata.youtube.service.YouTubeService()

import facebook
try:
    # NOTE(review): token fetch is disabled; the commented line embeds app
    # credentials in source — they should live in configuration instead.
    #access_token = facebook.get_app_access_token('193618104088301', '659217362b250bbdae0b61d1e437e8ca')
    access_token = None
except Exception as e:
    log.error(e)
    access_token = None
# Graph API client; with a None token only public data is reachable.
facebook_api = facebook.GraphAPI(access_token)


def check_url(url, netloc):
    """Return whether url is non-empty and hosted at netloc.

    Falsy urls ('' / None) are returned unchanged, matching the original
    short-circuit `url and ...` behavior.
    """
    if not url:
        return url
    return urlparse(url).netloc == netloc


def get_facebook_id(url):
    """
        Get facebook id or name from the url

        @type url: string
        @param url: facebook url
示例#17
0
# Module-level scraper setup: build one shared API client per service.
from contextlib import closing
import urllib2
from logUtil import log

import twitter
# Unauthenticated Twitter API client shared by the scrape functions.
twitter_api = twitter.Api()

import gdata.youtube.service
# YouTube GData service client.
youtube_api = gdata.youtube.service.YouTubeService()

import facebook
try:
    # NOTE(review): token fetch is disabled; the commented line embeds app
    # credentials in source — they should live in configuration instead.
    #access_token = facebook.get_app_access_token('193618104088301', '659217362b250bbdae0b61d1e437e8ca')
    access_token = None
except Exception as e:
    log.error(e)
    access_token = None
# Graph API client; with a None token only public data is reachable.
facebook_api = facebook.GraphAPI(access_token)

def check_url(url, netloc):
    """Return whether url is non-empty and hosted at netloc.

    Falsy urls ('' / None) are returned unchanged, matching the original
    short-circuit `url and ...` behavior.
    """
    if not url:
        return url
    return urlparse(url).netloc == netloc

def get_facebook_id(url):
    """
        Get facebook id or name from the url

        @type url: string
        @param url: facebook url

        @rtype:  string
        @return: facebook id or name