import csv
import time
import urllib.error

import googleapiclient.errors

# articleUrls, openurlforsoup, getappsfromlist, ArticleLoadError, pageanalytics
# and google are helpers defined elsewhere in this project; they are not shown
# in full on this page.


def single(article_id, outputcsv):
    """produce scraper report of apps on a single applist given the article id -
    some output values are dummied because they are usually obtained from the input file (in main)"""
    APP_LIST_IDX_app_id = 0
    APP_LIST_IDX_itunes_link = 1
    ARTICLE_TYPE = 2

    # open output file
    with open(outputcsv, 'w', newline='', encoding='utf-8') as outfileh:
        writer = csv.writer(outfileh,
                            delimiter=',',
                            quotechar='"',
                            escapechar='~',
                            doublequote=False,
                            quoting=csv.QUOTE_NONNUMERIC)

        # write out headings
        # these headings match the values passed to writer.writerow at the end of the loop below
        writer.writerow([
            'article_id', 'article_url', 'published_at', 'app_id',
            'itunes_link'
        ])

        # work out applist url based on article ID
        article_url = articleUrls(ARTICLE_TYPE, article_id, 'dont-know-slug')
        try:
            article_url, soup = openurlforsoup(article_url)
        except urllib.error.HTTPError:
            try:
                article_url, soup = openurlforsoup(article_url)  # try again
            except urllib.error.HTTPError as e:
                print("HTTP error opening article_id {}: {}".format(
                    article_id, e.reason))
                raise
            except Exception as e:
                print("General error opening article id {}: {}".format(
                    article_id, e))
                raise

        published_at = '????'
        apps = getappsfromlist(soup)

        for app in apps:
            writer.writerow([
                article_id, article_url, published_at,
                app[APP_LIST_IDX_app_id], app[APP_LIST_IDX_itunes_link]
            ])

        print(str(article_url))

    return None
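

# --- Helper sketch (not part of the original source) ------------------------
# openurlforsoup is a project helper that is not shown on this page. From the
# way it is called above, it takes a URL (or an articleUrls object), fetches
# the page, and returns the final URL together with a BeautifulSoup parse of
# it, raising urllib.error.HTTPError on failure. A minimal sketch, assuming
# BeautifulSoup 4 and the standard-library opener; the real helper may differ.
import urllib.request

from bs4 import BeautifulSoup


def openurlforsoup(url):
    """Open a URL and return (final_url, soup); sketch only."""
    with urllib.request.urlopen(str(url)) as response:
        final_url = response.geturl()  # reflects any redirects that were followed
        soup = BeautifulSoup(response.read(), 'html.parser')
    return final_url, soup
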
def main(
    inputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\ap_article.csv',
    hasheader=True,
    outputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\applist_scraper_report.csv'
):

    APP_LIST_IDX_app_id = 0
    APP_LIST_IDX_itunes_link = 1

    # open output file
    with open(outputcsv, 'w', newline='', encoding='utf-8') as outfileh:
        writer = csv.writer(outfileh,
                            delimiter=',',
                            quotechar='"',
                            escapechar='~',
                            doublequote=False,
                            quoting=csv.QUOTE_NONNUMERIC)
        # write out headings
        # these headings match the values passed to writer.writerow at the end of the loop below
        writer.writerow([
            'article_id', 'article_url', 'published_at', 'app_id',
            'itunes_link'
        ])

        # open input file
        with open(inputcsv, newline='\n', encoding='utf-8') as inputfileh:
            reader = csv.DictReader(inputfileh,
                                    fieldnames=('article_id', 'article_type',
                                                'published_at', 'slug'),
                                    delimiter=',',
                                    quotechar='"')
            i = 1
            if hasheader: next(reader)  # skip header row

            for row in reader:
                if i % 10 == 0: print('Record: {}'.format(i))
                article_id = row['article_id']
                article_type = row['article_type']
                slug = row['slug']
                article_url = articleUrls(article_type, article_id, slug)
                print(str(article_url))

                # replace the following row with value scraped from page
                #published_at = row['published_at']

                # scrape the apps on this article
                #writer.writerow([article_id, article_url, 'APPS'])
                try:
                    article_url, soup = openurlforsoup(str(article_url))
                except urllib.error.HTTPError:
                    print("Error while trying to open article_url: {0}".format(
                        article_url))
                    try:
                        article_url, soup = openurlforsoup(
                            article_url)  # try again
                    except urllib.error.HTTPError as e:
                        print("HTTP error opening article_id {}: {}".format(
                            article_id, e.reason))
                        writer.writerow([
                            article_id, article_url,
                            "Error opening article: {0}".format(e.reason), "-",
                            "-"
                        ])
                        continue
                    except Exception as e:
                        print("General error opening article id {}: {}".format(
                            article_id, e))
                        raise

                published_at = soup.findAll('p', class_='article-date small')
                if not published_at:
                    print("   This page's HTML does not contain class 'article-date small'")
                    published_at = "Couldn't find date on page"
                else:
                    published_at = published_at[0].getText()

                    # extracts a string like '21 Jun 2016, by\xa0Cherry Mae  Torrevillas'
                    # so get only text before first comma by splitting at most once on the separator
                    published_at = published_at.split(',', 1)[0]

                apps = getappsfromlist(soup)
                for app in apps:
                    writer.writerow([
                        article_id, article_url, published_at,
                        app[APP_LIST_IDX_app_id], app[APP_LIST_IDX_itunes_link]
                    ])

                i += 1
                #if i==50:break

    return None
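

# --- Helper sketch (not part of the original source) ------------------------
# getappsfromlist is another project helper that is not shown here. The callers
# above only rely on it returning an iterable of (app_id, itunes_link) pairs
# (indices APP_LIST_IDX_app_id / APP_LIST_IDX_itunes_link). A rough sketch of
# that contract, assuming the app entries on an applist page are anchors that
# point at itunes.apple.com; the selector and the id parsing are guesses.
def getappsfromlist(soup):
    """Return a list of (app_id, itunes_link) tuples scraped from an applist page."""
    apps = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if 'itunes.apple.com' not in href:
            continue
        # iTunes store URLs typically end in .../idNNNNNNNNN; treat the trailing
        # digits as the app id, falling back to the full href if absent.
        app_id = href.split('?', 1)[0].rstrip('/').rsplit('/id', 1)[-1]
        apps.append((app_id, href))
    return apps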


# Example #4: Google Analytics report for the articles listed in the input CSV.
def main(
        inputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\ap_article.csv',
        outputcsv='D:\\projects\\AppPicker\\reports\\best of lists performance\\article_visits_up.csv',
        start_date_str='2016-01-01',
        end_date_str='2016-01-31',
        trustslugs=True):
    """Generate a Google Analytics report for the articles listed in the input CSV. If trustslugs is False,
    query appPicker over HTTP for the real URLs instead of constructing them from the slugs in the input CSV."""
    broker = pageanalytics.Broker()

    # open output file
    with open(outputcsv, 'w', newline='', encoding='utf-8') as outfileh:
        writer = csv.writer(outfileh,
                            delimiter=',',
                            quotechar='"',
                            escapechar='~',
                            doublequote=False,
                            quoting=csv.QUOTE_NONNUMERIC)

        # write out headings
        # these headings depend on the metrics requested in the call to broker.get_results and match the values passed to writer.writerow at the end of the loop below
        writer.writerow([
            'article_id', 'published_at', 'page_url', 'users', 'new_users',
            'sessions', 'bounces', 'bounce_rate', 'avg_session',
            'page_views', 'avg_time_on_page', 'avg_page_load_secs',
            'sessions_per_user'
        ])

        # open input file
        with open(inputcsv, newline='\n', encoding='utf-8') as inputfileh:
            reader = csv.DictReader(inputfileh,
                                    fieldnames=('article_id', 'article_type',
                                                'published_at', 'slug'),
                                    delimiter=',',
                                    quotechar='"')
            i = 1
            next(reader)  # skip header row
            for row in reader:
                article_id = row['article_id']
                article_type = row['article_type']
                published_at = row.get('published_at', 'not provided')
                slug = row.get('slug', 'blah')
                if trustslugs:
                    article_url = articleUrls(article_type, article_id, slug)
                else:
                    try:
                        article_url = articleUrls(article_type, article_id,
                                                  'blah').realurl()
                    except ArticleLoadError as e:
                        writer.writerow([
                            article_id, published_at,
                            'Error: {0}'.format(e.customMessage), 'NULL',
                            'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL',
                            'NULL', 'NULL', 'NULL'
                        ])
                        print('Could not open URL: {0}'.format(e.customMessage))
                        continue

                print(str(article_url).replace('http://www.apppicker.com', ''))

                # get Google Analytics results for the period between the start and end dates
                try:
                    garesults = broker.get_results(
                        pagePath=str(article_url).replace(
                            'http://www.apppicker.com', ''),
                        start_date=start_date_str,
                        end_date=end_date_str,
                        metrics='ga:sessions,ga:pageviews,ga:users,ga:newUsers,ga:bounces,ga:avgTimeOnPage,ga:sessionsPerUser,ga:avgPageLoadTime,ga:avgSessionDuration,ga:bounceRate'
                    )
                except googleapiclient.errors.HttpError:
                    # back off briefly, then retry once
                    time.sleep(5)
                    garesults = broker.get_results(
                        pagePath=str(article_url).replace(
                            'http://www.apppicker.com', ''),
                        start_date=start_date_str,
                        end_date=end_date_str,
                        metrics='ga:sessions,ga:pageviews,ga:users,ga:newUsers,ga:bounces,ga:avgTimeOnPage,ga:sessionsPerUser,ga:avgPageLoadTime,ga:avgSessionDuration,ga:bounceRate'
                    )
                metrics = google.extract_metrics(garesults)
                # print('{}'.format(json.dumps(row)))

                writer.writerow([
                    article_id, published_at, article_url,
                    metrics.get('ga:users', 0),
                    metrics.get('ga:newUsers', 0),
                    metrics.get('ga:sessions', 0),
                    metrics.get('ga:bounces', 0),
                    metrics.get('ga:bounceRate', 0),
                    metrics.get('ga:avgSessionDuration', 0),
                    metrics.get('ga:pageviews', 0),
                    metrics.get('ga:avgTimeOnPage', 0),
                    metrics.get('ga:avgPageLoadTime', 0),
                    metrics.get('ga:sessionsPerUser', 0)
                ])
                i += 1
                if i % 10 == 0:
                    print('Record: {}'.format(i))
                    outfileh.flush()
                #if i==10:break

    return None
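

# --- Helper sketch (not part of the original source) ------------------------
# google.extract_metrics is a project helper not shown on this page (the module
# is referred to as `google` above). From its use it takes the raw Google
# Analytics Core Reporting API (v3) response and returns a dict keyed by metric
# name, e.g. {'ga:users': '42', ...}. A minimal sketch, assuming the v3
# response shape with 'columnHeaders' and 'rows'; the real helper may differ.
def extract_metrics(garesults):
    """Map each reported metric name to its value from the first result row."""
    headers = [h['name'] for h in garesults.get('columnHeaders', [])]
    rows = garesults.get('rows', [])
    if not rows:
        return {}  # no traffic recorded for this page in the date range
    return dict(zip(headers, rows[0]))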