import play_scraper

def list_of_details_of_collection(category):
    # COLLECTIONS is assumed to be defined at module level.
    print('Finding results for ' + category)
    with open('data/' + category + '.txt', 'w') as f:
        for collection in COLLECTIONS:
            for page in range(0, 42):
                try:
                    scraper = play_scraper.collection(
                        collection=collection, category=category,
                        results=120, page=page)
                except Exception:
                    # No more pages (or a failed request): stop paging.
                    break
                list_of_ids = []
                list_of_details = []
                for item in scraper:
                    list_of_ids.append(item['app_id'])
                for id in list_of_ids:
                    a = play_scraper.details(id)
                    b = {
                        a['app_id']: [
                            a['title'], a['developer_id'], a['installs'],
                            a['developer_url'], a['developer_email']
                        ]
                    }
                    list_of_details.append(b)
                    try:
                        f.write(str(b) + '\n')
                    except Exception:
                        break
    print(category + ' Done\n')
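# A minimal way to drive the function above. COLLECTIONS is not defined in
# the original snippet, so the value here is purely an assumption:
COLLECTIONS = ['TOP_FREE', 'TOP_PAID', 'NEW_FREE']

list_of_details_of_collection('GAME')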
from flask import render_template
import play_scraper

# Flask view function (the route decorator is not part of the excerpt).
def index():
    # Fetch the top new free games for the Indian store.
    trending = play_scraper.collection(gl='in', collection='NEW_FREE',
                                       category='GAME', results=120)
    trending_data = []
    for item in trending:
        print(item)
        trending_data.append([item['app_id'], item['icon'], item['url'],
                              item['title'], item['developer'],
                              item['score'], item['price']])
    return render_template('index.html', data=trending_data,
                           len=len(trending_data))
import play_scraper as ps

def getIdList(collectionType, numResults=None, category=None):
    if numResults is None:
        numResults = 1
    # returnAppIds is assumed to be defined elsewhere and to extract the
    # 'app_id' field from each result dict.
    return returnAppIds(
        ps.collection(collection=collectionType, results=numResults,
                      category=category))

# usage
print(getIdList('TRENDING', 100))
from time import sleep
import play_scraper

def download_app_details(self, collection, category, pg_num):
    # MAX_TRIES, APPS_PER_PAGE and MAX_BACKOFF are assumed module constants.
    n_tries = 0
    while n_tries < MAX_TRIES:
        n_tries += 1
        print("Will crawl", collection, category, pg_num, n_tries)
        try:
            res = play_scraper.collection(
                collection=collection,
                category=category,
                results=APPS_PER_PAGE,
                page=pg_num,
                detailed=True)
            print(res)
            return res
        except Exception as exc:
            print("Error", n_tries, collection, category, pg_num, exc)
            # Exponential backoff: 10, 20, 40... minutes, capped at MAX_BACKOFF.
            pause = min(((2 ** n_tries) * 60 * 5), MAX_BACKOFF)
            sleep(pause)
    return None
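# A sketch of how the retry helper above might be driven page by page;
# `crawler` (an instance of the class that defines download_app_details) and
# the collection/category/page values are assumptions, not from the original:
all_apps = []
for pg_num in range(5):
    batch = crawler.download_app_details('TOP_FREE', 'GAME', pg_num)
    if batch is None:  # gave up after MAX_TRIES failed attempts
        break
    all_apps.extend(batch)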
import urllib.request
import play_scraper as scraper

def scrapeCollectionScreenShots(collectionName):
    apps = scraper.collection(collection=collectionName)
    fileNameCount = 0
    for appDict in apps:
        currentAppID = appDict['app_id']
        currentAppDetailsDict = scraper.details(currentAppID)
        icon_url = currentAppDetailsDict['icon']
        appName = currentAppDetailsDict['title']
        # if 'GAME' in currentAppDetailsDict['category']:
        #     print('GameFound')
        #     continue
        # Download the app icon; titles containing path separators may need
        # sanitising before use as a file name.
        urllib.request.urlretrieve(icon_url, appName + '.png')
        fileNameCount += 1
import csv
import datetime
import play_scraper

def scrape_and_save(collection, category=None, results=120, pages=5):
    date = datetime.datetime.now().strftime("%Y-%m-%d")
    csv_filename = "data/{}-{}-{}".format(date, collection, category)
    print("creating: {}".format(csv_filename))
    scraped_array = []
    for page in range(pages):
        scraped_array.extend(
            play_scraper.collection(collection=collection, category=category,
                                    results=results, page=page))
    app_ids = []
    for item in scraped_array:
        app_ids.append(item["app_id"])
    # Deduplicate while preserving order.
    app_ids = list(dict.fromkeys(app_ids))
    app_details = []
    for identification in app_ids:
        scraped_details = play_scraper.details(identification)
        if "developer_address" in scraped_details:
            del scraped_details["developer_address"]
        app_details.append(scraped_details)
    # `convert` is assumed to be defined elsewhere (e.g. to normalise values
    # for CSV output).
    app_details = convert(app_details)
    csv_columns = app_details[0].keys()
    try:
        # csv in Python 3 needs text mode with newline='' (the original
        # opened the file in 'wb', which raises a TypeError here).
        with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            for data in app_details:
                writer.writerow(data)
    except IOError:
        print("I/O error")
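# Example invocation (hypothetical values): scrape five pages of top free
# games and write them to a dated CSV under data/.
scrape_and_save('TOP_FREE', category='GAME', results=120, pages=5)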
import json
import play_scraper
import pandas as pd

collection_name = 'TRENDING'
category_name = 'ENTERTAINMENT'
path_json = './FolderNameToSaveResponseFiles/ENTERTAINMENT.json'
path_csv = './FolderNameToSaveResponseFiles/ENTERTAINMENT.csv'

get_data = play_scraper.collection(collection=collection_name,
                                   category=category_name,
                                   results=120, page=1, detailed=True)

with open(path_json, "w") as f1:
    json.dump(get_data, f1)

df = pd.read_json(path_json)
df.to_csv(path_csv, encoding='utf-8')
#!/usr/bin/env python
# coding: utf-8

# In[60]:

import play_scraper

# In[61]:

trending = play_scraper.collection(gl='in', collection='TRENDING',
                                   results=120, page=2)

# In[62]:

import pandas as pd

trending_data = []
for item in trending:
    trending_data.append([
        item['app_id'], item['url'], item['title'],
        item['developer'], item['score'], item['price']
    ])

# Column names match the fields collected above (the original mislabelled
# the app_id column as "URL").
df = pd.DataFrame(trending_data, columns=[
    "App_ID", "Play_Store_URL", "Game_Name", "Company", "Rating", "Price"
])
df.to_csv('trending_data.csv', index=False)
from flask import render_template, request
import play_scraper

# Flask view function (the route decorator is not part of the excerpt).
def playstore():
    trending_data = []
    # The collection and category names come from the submitted form.
    trending = play_scraper.collection(gl='in',
                                       collection=request.form['col'],
                                       category=request.form['cat'],
                                       results=120)
    for item in trending:
        trending_data.append([item['app_id'], item['icon'], item['url'],
                              item['title'], item['developer'],
                              item['score'], item['price']])
    return render_template('result.html', data=trending_data,
                           len=len(trending_data))
            # ...tail of an earlier execute() call built by string
            # concatenation; the opening of the statement is truncated in
            # the source:
            + app_id[12][0] + "', 1, CONVERT_TIMEZONE('JST', SYSDATE) WHERE NOT EXISTS ( SELECT app_id FROM superset_schema.app_ids WHERE app_id = '" + app_id[12][0] + "'); ")
        if DEBUG:
            print(str(i) + ": " + app_id[12][0])
        i = i + 1

# Quick scraping pass.
# TODO: too many connections are opened; move the for loop inside the
# connection instead.
count = 1
for COLLECTION_NAME in ['NEW_FREE', 'TOP_FREE']:
    for i in range(5):
        if DEBUG:
            print("{}".format(COLLECTION_NAME))
        collections = play_scraper.collection(collection=COLLECTION_NAME,
                                              results=120, page=i)
        for collection in collections:
            detail = play_scraper.details(collection['app_id'])
            # Only keep apps whose primary category contains "GAME".
            if -1 != detail['category'][0].find("GAME"):
                with get_connection() as conn:
                    with conn.cursor() as cur:
                        # Parameterised insert-if-absent keyed on app_id.
                        cur.execute(
                            "INSERT INTO superset_schema.app_ids(app_id, platform, created_at) SELECT %s, 1, CONVERT_TIMEZONE('JST', SYSDATE) WHERE NOT EXISTS ( SELECT app_id FROM superset_schema.app_ids WHERE app_id = %s); ",
                            (detail['app_id'], detail['app_id']))
                        if DEBUG:
                            print(str(count) + ": " + detail['app_id'])
                        count = count + 1

with open(LOG, mode='a') as f:  # (body truncated in the source)
# Fetch the categories from get_categories().
categories = get_categories()
downloads_per_country_per_category = {}
for category in tqdm(categories):
    downloads_per_country_by_category = {}
    # Print the category that is being scraped.
    tqdm.write('Category: %s ' % category)
    for country in countries:
        # For each category and each country, scrape the Play Store and
        # write the information to the output/app_info/ folder (with the
        # appropriate file name).
        list_top_n_apps_by_category = \
            play_scraper.collection(collection='TOP_FREE', gl=country,
                                    category=category,
                                    results=number_of_results, detailed=True)
        # Write to output/app_info/{country}_{category}_{number_of_results}.json
        with open('output/app_info/%s_%s_%s.json'
                  % (country, category, number_of_results),
                  'w', encoding='utf-8') as file_pointer:
            json.dump(list_top_n_apps_by_category, file_pointer, indent=4)
        # Find the number of installs (in thousands) for each app in each
        # country and category.
        installs = [int(re.sub('[,+]', '', app['installs'])) / 1000
                    for app in list_top_n_apps_by_category]
        # Sum the downloads for this country within the current category.
        downloads_per_country_by_category[country] = sum(installs)
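    # Presumably the per-category totals are stored in the outer dict at this
    # point; the following line is an assumption, since it falls outside the
    # quoted excerpt:
    downloads_per_country_per_category[category] = downloads_per_country_by_category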
"MUSIC_AND_AUDIO", "NEWS_AND_MAGAZINES", "PERSONALIZATION", "PHOTOGRAPHY", "PRODUCTIVITY", "SHOPPING", "SPORTS", "WEATHER"] apps=[] for cat in CATEGORIES: for topic in collections: for p in range(0,4): g = play_scraper.collection(collection=topic, category = cat, results = 100, page = p, detailed = True, hl = 'en') apps.extend(g) print(p) time.sleep(30) app_price=[] app_score=[] app_name=[] app_description=[] app_category = [] for i in range(len(apps)): app_name.append(apps[i]['title']) app_description.append(apps[i]['description']) app_score.append(apps[i]['score'])
import play_scraper
from csv import writer

with open('spotify.csv', 'w', encoding="utf-8", newline='') as csv_file:
    csv_writer = writer(csv_file)
    headers = ["Details", "Developer", "Search", "Trending"]
    csv_writer.writerow(headers)
    details = play_scraper.details('com.spotify.music', hl='en', gl='in')
    # Apps published by the developer (the original labelled this column
    # "Similar", but the call below hits the developer endpoint).
    developer_apps = play_scraper.developer('Spotify Ltd.', results=5,
                                            hl='en', gl='in')
    search = play_scraper.search('com.spotify.music', detailed=True,
                                 hl='en', gl='in')
    collection = play_scraper.collection(collection='TRENDING',
                                         category='MUSIC_AND_AUDIO',
                                         results=10, hl='en', gl='in', page=1)
    csv_writer.writerow([details, developer_apps, search, collection])

# print(play_scraper.details('com.spotify.music'))
# print(play_scraper.collection(collection='TRENDING', category='MUSIC_AND_AUDIO', results=10, hl='en', gl='in', page=1))
# print(play_scraper.search('com.spotify.music', detailed=True, hl='en', gl='in'))
# print(play_scraper.developer('Spotify Ltd.', results=5))
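# If a "Similar" column was the original intent, play_scraper also exposes a
# similar() endpoint; a minimal sketch (only the positional app id is used
# here, since further keyword support is not verified):
similar_apps = play_scraper.similar('com.spotify.music')
print(similar_apps[:2])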
# wb, m, collect, k and l are assumed to be defined by surrounding code
# that is not part of this excerpt.
ws = wb.add_sheet(m)
ws.write(0, 0, "App_Id")
ws.write(0, 1, "URL")
ws.write(0, 2, "Icon")
ws.write(0, 3, "Title")
ws.write(0, 4, "Developer")
ws.write(0, 5, "Developer_id")
ws.write(0, 6, "Description")
ws.write(0, 7, "Score")
ws.write(0, 8, "Price")
ws.write(0, 9, "Free")
for u in collect:
    try:
        app = play_scraper.collection(collection=u, category=m,
                                      results=50, page=0)
        # Write each app's field values across the current row.
        for i in app:
            for j in i.values():
                ws.write(k, l, j)
                l = l + 1
            k = k + 1
            l = 0
    except Exception:
        # Skip collections that fail for this category.
        continue
wb.save("PlayStore.xls")
print("finally created")
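# None of wb, m, collect, k or l is defined in the excerpt above; a minimal
# sketch of the surrounding context might look like this (the categories,
# collections and counter values are assumptions):
import xlwt
import play_scraper

wb = xlwt.Workbook()
collect = ['TOP_FREE', 'NEW_FREE', 'TRENDING']
for m in ['GAME', 'SPORTS', 'WEATHER']:
    k = 1  # next data row (row 0 holds the headers)
    l = 0  # current column
    # ...the sheet-building code in the excerpt above runs here for each m...
    pass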