def upload_rewards_from_html_database(html_database, offset=0, chunk_size=10000,
                                      output_failed_projects=True):
    if output_failed_projects:
        Rewards.failed_projects_output_file = open('failed_reward_project_ids.txt', 'wb')
    print "Getting existing rewards"
    t1 = time.time()
    kickstarter_db = db_connections.get_fungrosencrantz_schema('kickstarter')
    existing_project_ids = set([row['projectid'] for row in kickstarter_db.query(
        'select distinct projectid from reward')])
    print '\ttook {0}'.format(time.time() - t1)
    while True:
        # have to reconnect or else it will lose connection after a while
        kickstarter_db = db_connections.get_fungrosencrantz_schema('kickstarter')
        print "Getting html from database, from {0} to {1}".format(offset, offset + chunk_size)
        t1 = time.time()
        results = html_database.query(
            'select projectid, html, url from reward_html limit {0}, {1}'.format(
                offset, chunk_size))
        print '\ttook {0}'.format(time.time() - t1)
        print "Parsing html"
        t1 = time.time()
        rewards = []
        # see if the offset is past the end of the database
        end_of_database = True
        for index, row in enumerate(results):
            end_of_database = False
            if row['projectid'] not in existing_project_ids:
                r = Rewards(row['html'], row['projectid']).get_rewards()
                if r is not None:
                    rewards += r
        # break if offset is past the end of the database
        if end_of_database:
            print "Nothing to parse: end of database"
            break
        print '\ttook {0}'.format(time.time() - t1)
        print "Uploading projects"
        t1 = time.time()
        kickstarter_db['reward'].insert_many(ensure=False, rows=rewards)
        print '\ttook {0}'.format(time.time() - t1)
        offset += chunk_size
    if output_failed_projects:
        Rewards.failed_projects_output_file.close()
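
# A minimal usage sketch, assuming the reward_html table lives in the
# intermediate database returned by db_connections.get_intermediate_db (used
# elsewhere in this repo); the helper name and chunk size are illustrative,
# not verbatim project code.
def run_reward_upload():  # hypothetical helper
    html_db = db_connections.get_intermediate_db()
    upload_rewards_from_html_database(html_db, offset=0, chunk_size=10000)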
def update_backers_db_from_queue(project_queue, fail_queue, process_id):
    ff_profile = os.getcwd() + "\\lib\\quick_firefox"
    browser = splinter.Browser('firefox', profile=ff_profile)
    while True:
        p = project_queue.get()
        if p is None:  # signal that there are no more projects
            browser.quit()
            project_queue.task_done()
            break
        url = p['url']
        projectid = p['id']
        try:
            backers = get_backers_from_url(url=url, max_wait=30, browser=browser)
            data = tablib.Dataset()
            data.headers = ['projectid', 'userid', 'name', 'raw_location']
            for backer in backers:
                row = (projectid, backer['id'], backer['name'], backer['raw_location'])
                data.append(row)
            db = db_connections.get_fungrosencrantz_schema(schema='kickstarter')
            db_connections.uploadOutputFile(data=data, db=db, table='backer')
            del db
            print "Success: " + url
        except:
            fail_queue.put(url)
            print "Failed: " + url
        project_queue.task_done()
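
# A minimal sketch of how the worker above might be driven with the standard
# multiprocessing module; the worker count and the `projects` iterable are
# assumptions for illustration, not verbatim project code. Each worker exits
# when it pulls the None sentinel, so one sentinel is enqueued per worker.
import multiprocessing

def run_backer_workers(projects, num_workers=4):  # hypothetical helper
    project_queue = multiprocessing.JoinableQueue()
    fail_queue = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=update_backers_db_from_queue,
                                       args=(project_queue, fail_queue, i))
               for i in range(num_workers)]
    for w in workers:
        w.start()
    for p in projects:
        project_queue.put(p)
    for _ in range(num_workers):
        project_queue.put(None)  # stop sentinel, one per worker
    project_queue.join()  # blocks until every item is marked task_done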
def select():
    db = db_connections.get_fungrosencrantz_schema('crowdrise')
    t0 = time.time()
    q = [x for x in db.query('select * from html limit 300')]
    logging.debug('reading {} files took {} seconds'.format(
        len(q), time.time() - t0))
def run(start_date, end_date, file_name):
    """
    :param start_date: Consider projects that have a start date after this date (format YYYY/MM/DD)
    :param end_date: Consider projects that have a start date before this date (format YYYY/MM/DD)
    :param file_name: Output file name.
    :return:
    """
    # dates must be quoted in the SQL, otherwise YYYY/MM/DD is evaluated as division
    queries = {
        'project': """select * from project
                      left join location on project.location_slug = location.slug
                      where start_date between '{}' and '{}';""".format(start_date, end_date),
        'reward': """select * from reward
                     join (project left join location on project.location_slug = location.slug)
                         on reward.projectid = project.id
                     where start_date between '{}' and '{}';""".format(start_date, end_date),
        'update': """select * from `update`
                     join (project left join location on project.location_slug = location.slug)
                         on `update`.projectid = project.id
                     where start_date between '{}' and '{}';""".format(start_date, end_date)
    }
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    writer = pd.ExcelWriter(file_name + '.xlsx', engine='xlsxwriter',
                            options={'strings_to_urls': False})
    for table in queries:
        print "downloading {}".format(table)
        df = pd.read_sql(queries[table], db.executable)
        print "inserting into spreadsheet"
        df.to_excel(writer, table)
    print "saving spreadsheet"
    writer.save()
def generate_statistics_file(table, columns, table_columns=None):
    db = db_connections.get_fungrosencrantz_schema(traditional=True)
    for c in columns:
        if table_columns is None:
            assert c in db[table].columns
        else:
            assert c in table_columns
    q, headers = generate_full_statistics_query(table=table, columns=columns)
    data = []
    try:
        for i, row in enumerate(db.query(q)):
            row['Name'] = columns[i]
            data.append(row)
    except:
        print(q)
        time.sleep(0.2)
        raise
    tablib_data_unordered = tablib.Dataset()
    tablib_data_unordered.dict = data
    tablib_data = tablib.Dataset()
    for header in ['Name'] + headers:
        tablib_data.append_col(tablib_data_unordered[header], header=header)
    tablib_data.headers = ['Name'] + headers
    with open('{0}_statistics.xlsx'.format(table), 'wb') as f:
        f.write(tablib_data.xlsx)
def download_update_pages(skip_existing=True, only_updated_projects=False):
    db = db_connections.get_fungrosencrantz_schema('kickstarter_new')  # TODO: change after move
    orig_dir = os.getcwd()
    os.chdir('/mnt/data/scrape/kickstarter_updates')
    with open('all_urls', 'w') as f:
        for row in db.query('select url from all_files'):
            url = row['url']
            if url.endswith('/'):
                f.write(url + 'updates' + '\n')
            else:
                f.write(url + '/updates' + '\n')
    if only_updated_projects:
        raise NotImplementedError()
    else:
        if skip_existing:
            subprocess.call(
                'wget -i all_urls --no-clobber --force-directories --output-file=wget.log',
                shell=True)
        else:
            subprocess.call(
                'wget -i all_urls --timestamping --force-directories --output-file=wget.log',
                shell=True)
    os.chdir(orig_dir)
def scrape_location(page_source):
    tree = lxml.html.fromstring(page_source)
    locations = dict()
    location_sections = tree.xpath('//div[@class="project-location"]/a')
    # this comes from the website
    LOCATION_KEYS = [
        'name', 'short_name', 'country', 'id', 'is_root', 'state', 'urls',
        'type', 'displayable_name', 'slug'
    ]
    for l_section in location_sections:
        location = json.loads(s=l_section.attrib['data-location'])
        # compare as sets: dict key order is arbitrary in Python 2
        if set(location.keys()) != set(LOCATION_KEYS):
            print location.keys()
            raise Exception("Bad location")
        location['status'] = location['state']
        del location['state']
        del location['urls']
        name = location['displayable_name']
        locations[name] = location
    db = db_connections.get_fungrosencrantz_schema(schema='kickstarter', traditional=True)
    db_connections.uploadOutputFile(data=locations.values(), db=db,
                                    table='location', strict=True)
def run():
    from utils.useful_functions import ensure_directory
    sql = ("select id as project_id, goal, amount_pledged, backer_count, category, "
           "year(start_date) as start_year, currency from project "
           "where currency is not Null")
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    df = pd.read_sql(sql=sql, con=db.executable)
    convert_kickstarter_currency_df_to_usd(df, replace=True)
    base_directory = '/usr/share/nginx/html/crowdfunding/my/scott/kickstarter/goal_vs_actual'
    original_directory = os.getcwd()
    os.chdir(base_directory)
    data_directory = 'analysis_output'
    # def start_date_to_str(row):
    #     return row['start_date'].isoformat()
    # df['start_date'] = df.apply(start_date_to_str, axis=1)
    mapping = dict()
    for year_category, year_category_df in df.groupby(['start_year', 'category']):
        year = year_category[0]
        category = year_category[1]
        cur_directory = '{}/{}'.format(data_directory, year)
        ensure_directory(cur_directory)
        output_file = '{}/{}.json'.format(cur_directory, category)
        if year not in mapping.keys():
            mapping[year] = dict()
        mapping[year][category] = output_file
        with open(output_file, 'w') as f:
            json.dump(year_category_df.to_dict(orient='records'), f)
    ensure_directory(data_directory)
    with open('{}/mapping.json'.format(data_directory), 'w') as f:
        json.dump(mapping, f)
    os.chdir(original_directory)
def delete_redundant_directories():
    db = db_connections.get_fungrosencrantz_schema('crowdrise')
    urls = [x['url'] + '/*' for x in db.query(
        'select url from fundraiser where url not like "%/fundraiser/%";')]
    urls_split = split_array_into_chunks(urls, 10000)
    for chunk in urls_split:
        with open('directories_to_remove', 'w') as f:
            f.write(" ".join(chunk))
        subprocess.call('rm -Rf `cat directories_to_remove`', shell=True)
def get_projects_with_null_backerid(offset=0, limit=1000, project_table='project',
                                    backer_table='backer'):
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    q = """SELECT id, url
           FROM {0}
           left join {1} on {0}.id = {1}.projectid
           where projectid is null and backers_count > 50
           order by backers_count asc""".format(project_table, backer_table)
    if limit is not None:
        q = q + "\nlimit {0}, {1}".format(offset, limit)
    return db.query(q)
def get_projects_with_null_backerid(offset=0, limit=1000, project_table='project',
                                    backer_table='backer'):
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    results = db.query(
        """SELECT id, url
           FROM {0}
           left join {1} on {0}.id = {1}.projectid
           where projectid is null and backers_count > 0
           limit {2}, {3}""".format(project_table, backer_table, offset, limit))
    return [x for x in results]
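
# A short sketch connecting this query to the queue-based scraper in
# update_backers_db_from_queue above; the wiring is an assumption for
# illustration, not verbatim project code.
def queue_projects_missing_backers(project_queue, num_workers):  # hypothetical helper
    for p in get_projects_with_null_backerid(offset=0, limit=1000):
        project_queue.put(p)  # rows carry the 'id' and 'url' keys the worker expects
    for _ in range(num_workers):
        project_queue.put(None)  # one stop sentinel per worker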
def download_update_pages(workers):
    kickstarter_db = db_connections.get_fungrosencrantz_schema('kickstarter')
    # get the urls and their corresponding ids to download
    ids_for_base_urls = db_connections.get_ids_for_base_urls(
        db=kickstarter_db, table_name='project')
    webpageDownloader.urls_to_database(
        ids_for_base_urls=ids_for_base_urls,
        db_connector=db_connections.get_intermediate_db,
        html_table_name='update_html',
        url_append='/updates',
        verbose_level=2,
        auto_adjust_workers=False,
        max_workers=workers,
        chunk_size=104)
def upload_scraped_projects(scraped_projects, backer_table='backer', schema='kickstarter'):
    db = db_connections.get_fungrosencrantz_schema(schema)
    data = tablib.Dataset()
    data.headers = tuple(db[backer_table].columns)
    for i in range(len(scraped_projects)):
        # map the first three backer-table columns to the scraped values
        row = {
            data.headers[0]: scraped_projects[i][0],
            data.headers[1]: scraped_projects[i][1],
            data.headers[2]: scraped_projects[i][2]
        }
        data.append(row)
    return data
def download_reward_pages(workers):
    kickstarter_db = db_connections.get_fungrosencrantz_schema('kickstarter')
    ids_for_base_urls = db_connections.get_ids_for_base_urls(
        db=kickstarter_db,
        table_name='project',
        query_append="",
        scrape_table_name='reward',
        scrape_table_column_name='projectid')
    webpageDownloader.urls_to_database(ids_for_base_urls,
                                       db_connections.get_intermediate_db,
                                       'reward_html',
                                       url_append='/rewards',
                                       chunk_size=104,
                                       max_workers=workers,
                                       verbose_level=2)
def query_to_excel(query, chunk_size=10, outfile='output.xlsx', schema='Kiva'):
    from utils.useful_functions import split_array_into_chunks
    writer = pd.ExcelWriter(outfile, engine='xlsxwriter',
                            options={'strings_to_urls': False})
    db = db_connections.get_fungrosencrantz_schema(schema)
    df = pd.read_sql(query, con=db.executable)
    print "done downloading"
    sheet_num = 0
    for df_chunk in split_array_into_chunks(df, chunk_size=chunk_size):
        sheet_num += 1
        if len(df_chunk) == 0:
            break
        df_chunk.to_excel(writer, sheet_name='sheet{}'.format(sheet_num))
    writer.save()
def output_founder_graphs():
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    fig = plt.figure(figsize=(8, 6))
    query = """
    SELECT
        collapsed_projects_by_founder AS `num_projects_by_founder`,
        sum(num_successful) `num_successful`,
        sum(num_failed) `num_failed`,
        sum(num_other) `num_other`,
        sum(num_successful)/sum(projects_by_founder) `success_rate`,
        count(*) `num_founders`
    FROM (
        SELECT
            founder_id,
            count(*) AS projects_by_founder,
            #IF(count(*) <= 10, count(*), 11) AS collapsed_projects_by_founder,
            count(*) AS collapsed_projects_by_founder,
            sum(status = 'successful') AS num_successful,
            sum(status = 'failed') `num_failed`,
            sum(status not in ('successful', 'failed')) `num_other`
        FROM project
        JOIN location ON project.location_slug = location.slug
        WHERE location.country = 'US'
            AND start_date < (SELECT min(start_date) FROM project WHERE status = 'live')
        GROUP BY founder_id) AS t1
    GROUP BY collapsed_projects_by_founder;
    """
    df = pd.read_sql(query, db.engine)
    y = df['num_founders'].values
    x = df['num_projects_by_founder'].values
    # size = np.sqrt(df['num_founders'].values * 200)
    ax = fig.add_axes((.18, .2, .80, .75))
    plot = ax.scatter(x, y, marker='o', c=df['success_rate'], cmap='gray_r')
    fig.colorbar(plot, label='Success Rate')
    plt.title('Number of Projects Made by a Founder vs Number of Founders')
    plt.xlabel('Number of Projects Made by Founder')
    plt.ylabel('Number of Founders')
    ax.set_yscale('log')
    plt.savefig('plot_founder_distribution.png', dpi=350)
def download_small_backers(workers):
    html_db = db_connections.get_intermediate_db()
    kickstarter_db = db_connections.get_fungrosencrantz_schema('kickstarter')
    # get the urls and their corresponding ids to download
    ids_for_base_urls = db_connections.get_ids_for_base_urls(
        db=kickstarter_db,
        table_name='project',
        query_append='and backers_count <= 50 and backers_count > 0',
        scrape_table_name='backer',
        scrape_table_column_name='projectid')
    webpageDownloader.urls_to_database(
        ids_for_base_urls=ids_for_base_urls,
        db_connector=db_connections.get_intermediate_db,
        html_table_name='backer_html',
        url_append='/backers',
        verbose_level=2,
        auto_adjust_workers=False,
        max_workers=workers)
def clean_sitemap_table(db=None):
    if db is None:
        db = db_connections.get_fungrosencrantz_schema('crowdrise')
    q = """UPDATE sitemap
           SET loc=left(loc, char_length(loc)-1)
           WHERE right(loc, 1) = '/'
           LIMIT 100000000000;"""
    db.query(q)
    q = """UPDATE sitemap
           SET loc=REPLACE(loc, 'http:', 'https:')
           WHERE loc LIKE 'http:%'
           LIMIT 100000000000;"""
    db.query(q)
    q = """DELETE FROM sitemap
           WHERE loc='https://www.crowdrise.com';"""
    db.query(q)
def download_plot_data():
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    query = """
    SELECT
        concat(year(start_date), 'Q', quarter(start_date)) AS `quarter`,
        backer_count,
        new_backers,
        repeat_backers,
        amount_pledged,
        currency,
        start_date,
        category,
        status,
        goal,
        founder_id
    FROM project
    JOIN location ON project.location_slug = location.slug
    WHERE location.country = 'US'
        AND NOT (year(start_date) = year(now()) AND quarter(start_date) = quarter(now()))
    order by rand(1)
    """
    full_df = pd.read_sql(query, db.engine, parse_dates=['start_date'])
    useful_functions.convert_currency(
        full_df,
        currency_column='currency',
        money_column='amount_pledged',
        date_column='start_date')
    del full_df['currency']
    del full_df['start_date']
    full_df.to_pickle('plot_data.pickle')
def upload_community_data(community_data=None):
    if community_data is None:
        with open('community_data.pickle', 'rb') as f:
            community_data = pickle.load(f)
    project_data = []
    backers_in_city_data = []
    backers_in_country_data = []
    for project in community_data:
        backers_in_city_data += project.pop('backers_by_city')
        backers_in_country_data += project.pop('backers_by_country')
        project_data.append(project)
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    # db_connections.uploadOutputFile(data=project_data, db=db, table='project')
    db_connections.uploadOutputFile(data=backers_in_city_data, db=db,
                                    table='backers_in_city')
    db_connections.uploadOutputFile(data=backers_in_country_data, db=db,
                                    table='backers_in_country')
def run():
    db = db_connections.get_fungrosencrantz_schema('crowdrise')
    chunk_size = 100
    html_data = []
    for chunk_index, chunk in enumerate(
            split_array_into_chunks([
                x['file_name'] for x in db.query(
                    '''select file_name
                       from all_files
                       left join html on all_files.file_name = html.url
                       where html.url is null;''')
            ], chunk_size=chunk_size)):
        t0 = time.time()
        for file_index, file_path in enumerate(chunk):
            cur_index = chunk_index * chunk_size + file_index
            try:
                with open(file_path, 'rb') as f:
                    f_read = f.read()
            except IOError:
                logging.error(traceback.format_exc())
                try:
                    with open(file_path.replace('E:\\', ''), 'rb') as f:
                        f_read = f.read()
                except IOError:
                    with open('files_not_found', 'a') as f:
                        f.write(file_path + '\n')
                    continue
            # logging.debug('{}: {}'.format(cur_index, file_path))
            html_data.append(
                dict(
                    url=file_path,
                    html=f_read,
                    last_scrape=time.gmtime(os.path.getmtime(file_path)),
                ))
        db['html'].insert_many(html_data, ensure=False)
        logging.debug('inserted through {}; took {}'.format(
            cur_index, time.time() - t0))
        html_data = []
def get_all_pages_to_download(db=None):
    if db is None:
        db = db_connections.get_fungrosencrantz_schema('crowdrise')
    q = """SELECT CONCAT(t2.loc, '/fundraiser/',
                         REPLACE(t2.loc, 'https://www.crowdrise.com/', '')) AS loc
           FROM sitemap AS t1
           JOIN sitemap AS t2 ON t1.loc = t2.loc
               AND t1.category = 'users'
               AND t2.category = 'fundraisers'
           UNION
           SELECT loc FROM sitemap;
    """
    logging.info('Querying database')
    r = db.query(q)
    logging.info('Writing URLs to file')
    with open('pages_to_download.txt', 'w') as f:
        for row in r:
            f.write('{}\n'.format(row['loc']))
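
# A hedged sketch of feeding pages_to_download.txt to wget, mirroring the
# flags used by download_update_pages earlier in this repo; the helper name
# and exact invocation are assumptions, not verbatim project code.
import subprocess

def download_all_pages():  # hypothetical helper
    subprocess.call(
        'wget -i pages_to_download.txt --no-clobber --force-directories '
        '--output-file=wget.log',
        shell=True)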
def get_main_page_html():
    import splinter
    import os
    import time
    ff_profile = os.getcwd() + "\\lib\\quick_firefox"
    browser = splinter.Browser('firefox', profile=ff_profile)
    for i in range(5):
        try:
            browser.visit('https://www.kickstarter.com/discover/advanced?sort=newest')
            break
        except:
            print "Trying to load page again..."
    with codecs.open('output_before.html', 'wb', encoding='utf-8') as f:
        f.write(browser.html)
    num_reloads = 0
    while browser.title.find("We're sorry, but something went wrong") != -1:
        time.sleep(2)
        browser.reload()
        num_reloads += 1
        print "Oh no!"
        if num_reloads > 5:
            break  # to make sure we don't load too many urls
    print "Getting projects in the big database"
    project_db = db_connections.get_fungrosencrantz_schema('kickstarter')
    all_project_urls = set(
        [x['url'] for x in project_db.query('select url from project')])
    print "Getting projects in the intermediate database"
    intermediate_db = db_connections.get_intermediate_db()
    urls_to_scrape = set([
        x['url'] for x in intermediate_db.query('select url from urls_to_scrape')
    ])
    print "Starting download...."
    wait_time = 0
    wait_step = 0.05
    max_wait = 60
    max_tries = 3
    num_tries = 0
    num_hrefs = 0
    last_num_hrefs = 0
    extra_pages = None
    while True:
        orig_html_length = len(browser.html)
        button = browser.find_by_xpath(
            '//div[@class="load_more"]/a[@role="button"]')[-1]
        button.click()
        time.sleep(wait_step)
        while True:
            project_hrefs = browser.find_by_xpath(
                '//h6[@class="project-title"]/a')
            num_hrefs = len(project_hrefs)
            if num_hrefs != last_num_hrefs:
                break
            print "Waiting..."
            time.sleep(wait_step)
            wait_time += wait_step
            if wait_time >= max_wait:
                break
        if wait_time >= max_wait:
            num_tries += 1
            if num_tries >= max_tries:
                break
        else:
            num_tries = 0
        last_num_hrefs = num_hrefs
        print "{0} projects loaded".format(num_hrefs)
        last_href = project_hrefs[-1]['href']
        last_href = last_href.replace("?ref=newest", "")
        if last_href in all_project_urls or last_href in urls_to_scrape:
            if extra_pages is None:
                extra_pages = 3
            else:
                extra_pages += -1
            if extra_pages == 0:
                break
    page_source = browser.html
    browser.quit()
    return page_source
import sys

if '../' not in sys.path:
    sys.path.insert(0, '../')
from unused_scripts import db_connections

db1 = db_connections.get_fungrosencrantz_schema('TestDBforScott')
db2 = db_connections.get_fungrosencrantz_schema('Kiva')
for table in db1.tables:
    original_count = db1.query(
        'select count(*) as cnt from {}'.format(table)).next()['cnt']
    new_count = db2.query(
        'select count(*) as cnt from {}'.format(table)).next()['cnt']
    if original_count != new_count:
        print(table, original_count, new_count)
    with open('../{}_desc_stats.latex'.format(outfile_name), 'w') as f:
        f.write(desc_df.to_latex())
    desc_df.to_csv('../{}_desc_stats.csv'.format(outfile_name))
    print(desc_df.to_latex())
    print(desc_df)


if __name__ == '__main__':
    from unused_scripts import db_connections
    import logging
    logging.basicConfig(level=logging.DEBUG)
    os.chdir('kickstarter')
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    '''
    get_descriptive_stats_latex(db=db, table="""
        (select amount_raised, donation_count, IFNULL(team_members, 1) as team_members
         from (
             SELECT
                 fundraiser.url,
                 fundraiser.total_raised AS amount_raised,
                 IFNULL(donation_count, 0) AS donation_count
             FROM fundraiser
             LEFT JOIN (SELECT url, count(*) AS donation_count
                        FROM donation
                        GROUP BY url) AS t1
                 ON fundraiser.url = t1.url) as t2
def run():
    logging.basicConfig(level=logging.DEBUG)
    os.chdir('kickstarter')
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    # get_kiva_stats(db)
    get_kickstarter_stats(db)
def output_graphs(category=None, category_in_title=None):
    db = db_connections.get_fungrosencrantz_schema('kickstarter')
    fig = plt.figure(figsize=(8, 6))
    if category is None:
        file_name_prepend = 'plot_all_'
        category_requirement = ' is not null '
    else:
        file_name_prepend = 'plot_{}_'.format(
            category.replace(' ', '').replace('&', '_and_'))
        category_requirement = ' = "{}"'.format(category)
    query = """
    SELECT
        concat(year(start_date), 'Q', quarter(start_date)) AS `quarter`,
        sum(new_backers) AS `new_backers`,
        sum(repeat_backers) AS `repeat_backers`,
        std(new_backers) AS `new_backers_std`,
        std(repeat_backers) AS `repeat_backers_std`
    FROM project
    JOIN location ON project.location_slug = location.slug
    WHERE location.country = 'US'
        AND new_backers IS NOT NULL
        AND project.repeat_backers IS NOT NULL
        AND NOT (year(start_date) = year(now()) AND quarter(start_date) = quarter(now()))
        AND category {}
    GROUP BY year(start_date), quarter(start_date);
    """.format(category_requirement)
    df = pd.read_sql(query, db.engine)
    plot_quarterly_stacked_df(fig=fig,
                              df=df,
                              filename=file_name_prepend + 'backers_split.png',
                              xlabel='Starting Quarter',
                              ylabel='Number of Backers',
                              title='Number of Backers by Quarter{}'.format(
                                  '' if category_in_title is None
                                  else " ({}) ".format(category_in_title)),
                              x_series=df['quarter'],
                              y_series=df['repeat_backers'],
                              y_series_2=df['new_backers'],
                              y_series_labels=['Repeat Backers', 'New Backers'])
    plot_num_projects_per_day(db=db,
                              fig=fig,
                              file_name_prepend=file_name_prepend,
                              category_requirement=category_requirement,
                              category=category_in_title)
    query = """
    select
        amount_pledged,
        currency,
        start_date,
        concat(year(start_date), 'Q', quarter(start_date)) as `quarter`
    from project
    JOIN location ON project.location_slug = location.slug
    where location.country = 'US'
        AND start_date is not null
        and amount_pledged is not null
        and category {}
        AND NOT (year(start_date) = year(now()) and quarter(start_date) = quarter(now()))
    order by rand();
    """.format(category_requirement)
    pledged_df = pd.read_sql(query, db.engine, parse_dates=['start_date'])
    useful_functions.convert_currency(
        pledged_df,
        currency_column='currency',
        money_column='amount_pledged',
        date_column='start_date')
    del pledged_df['currency']
    del pledged_df['start_date']
    df = pledged_df.groupby(by=['quarter']).sum()
    plot_quarterly_df(fig=fig,
                      df=df,
                      filename=file_name_prepend + 'pledged_total.png',
                      xlabel='Starting Quarter',
                      ylabel='Amount Pledged (USD)',
                      title='Total Amount Pledged by Quarter{}'.format(
                          '' if category_in_title is None
                          else " ({}) ".format(category_in_title)),
                      x_series=df.index,
                      y_series=df['amount_pledged'])
    df = pledged_df.groupby(by=['quarter']).mean()
    plot_quarterly_df(fig=fig,
                      df=df,
                      filename=file_name_prepend + 'pledged_avg.png',
                      xlabel='Starting Quarter',
                      ylabel='Amount Pledged (USD)',
                      title='Average Amount Pledged per Project by Quarter{}'.format(
                          '' if category_in_title is None
                          else " ({}) ".format(category_in_title)),
                      x_series=df.index,
                      y_series=df['amount_pledged'])
    query = """
    SELECT
        concat(year(start_date), 'Q', quarter(start_date)) AS `quarter`,
        avg(backer_count) AS `mean_backers`,
        sum(backer_count) AS `total_backers`
    FROM project
    JOIN location ON project.location_slug = location.slug
    WHERE location.country = 'US'
        AND start_date IS NOT NULL
        and category {}
        AND NOT (year(start_date) = year(now()) and quarter(start_date) = quarter(now()))
    GROUP BY year(start_date), quarter(start_date)
    """.format(category_requirement)
    df = pd.read_sql(query, db.engine)
    plot_quarterly_df(fig=fig,
                      df=df,
                      filename=file_name_prepend + 'backers_avg.png',
                      xlabel='Starting Quarter',
                      ylabel='Number of Backers',
                      title='Average Number of Backers per Project by Quarter{}'.format(
                          '' if category_in_title is None
                          else " ({}) ".format(category_in_title)),
                      x_series=df['quarter'],
                      y_series=df['mean_backers'])
    plot_quarterly_df(fig=fig,
                      df=df,
                      filename=file_name_prepend + 'backers_total.png',
                      xlabel='Starting Quarter',
                      ylabel='Number of Backers',
                      title='Total Number of Backers by Quarter{}'.format(
                          '' if category_in_title is None
                          else " ({}) ".format(category_in_title)),
                      x_series=df['quarter'],
                      y_series=df['total_backers'])
    query = """
    SELECT
        concat(year(start_date), 'Q', quarter(start_date)) AS `quarter`,
        count(*) `count`
    FROM project
    JOIN location ON project.location_slug = location.slug
    WHERE location.country = 'US'
        AND start_date is not null
        and category {}
        AND NOT (year(start_date) = year(now()) and quarter(start_date) = quarter(now()))
    GROUP BY year(start_date), quarter(start_date)
    """.format(category_requirement)
    df = pd.read_sql(query, db.engine)
    plot_quarterly_df(fig=fig,
                      df=df,
                      filename=file_name_prepend + 'projects_total_quarterly.png',
                      xlabel='Starting Quarter',
                      ylabel='Number of Projects',
                      title='Number of Projects Started Each Quarter{}'.format(
                          '' if category_in_title is None
                          else " ({}) ".format(category_in_title)),
                      x_series=df['quarter'],
                      y_series=df['count'])
def scrape_new_html(limit=20, url_comment_id=dict(), test_url=None):
    theta_conn = db_connections.get_theta_postgres_db()
    theta_cur = theta_conn.cursor()
    theta_cur.execute('set search_path = "backend"')
    if test_url is not None:
        theta_cur.execute(
            "select loc, html from html where loc = '{}';".format(test_url))
    else:
        theta_cur.execute("""
            SELECT html.loc, html
            FROM html
            JOIN sitemap ON html.loc = sitemap.loc
            WHERE (last_scrape IS NULL OR lastmod > last_scrape)
                AND html IS NOT NULL
                --AND NOT ('fundraisers' = ANY (categories))
                AND NOT ('static' = ALL (categories)
                         OR html.loc = 'https://www.crowdrise.com')
            limit {};""".format(limit))
    html_data = theta_cur.fetchall()
    if len(html_data) == 0:
        theta_cur.close()
        theta_conn.close()
        return True
    all_data = dict(fundraiser=[],
                    user=[],
                    charity=[],
                    event=[],
                    special_user=[],
                    front_page_redirect=[],
                    user_project=[],
                    charity_event=[],
                    team=[],
                    donation=[])
    scraped_urls = []
    for url, html in html_data:
        scraped_urls.append(url)
        try:
            # root = lxml.html.fromstring(lxml.html.tostring(lxml.html.fromstring(html.encode('latin1'))).decode('utf8'))
            try:
                root = lxml.html.fromstring(html.encode('latin1').decode('utf8'))
            except UnicodeDecodeError:
                logging.warning('unicode decode error for url "{}"'.format(url))
                theta_conn, theta_cur = keep_theta_conn_alive(theta_conn, theta_cur)
                theta_cur.execute(
                    'insert into html_bad_encoding values (%s) on CONFLICT DO NOTHING ;',
                    [(url, )])
                theta_conn.commit()
                root = lxml.html.fromstring(
                    html.encode('latin1').decode('utf8', errors='ignore'))
            try:
                page_type = CrowdriseScraper.get_page_type(root)
            except NotImplementedError:
                theta_conn, theta_cur = keep_theta_conn_alive(theta_conn, theta_cur)
                theta_cur.executemany(
                    "insert into unknown_page_type values (%s) on CONFLICT DO NOTHING;",
                    [(url, )])
                theta_conn.commit()
                continue
            page_data = CrowdriseScraper.get_crowdrise_data(
                page_type, root, url, latest_comment_id=url_comment_id.get(url))
            if page_data is not None:
                # file_data['file_path'] = cur_file_name
                page_data['url'] = url
                page_data['true_url'] = root.xpath(
                    '//meta[@property="og:url"]')[0].attrib['content'].replace(
                        'https://', '').replace('http://', '')
                page_data['base_true_url'] = None
                # file_data['last_scrape'] = time.gmtime(os.path.getmtime(cur_file_name))
                # handle data that requires its own table - eg the fundraisers each user has
                if 'projects' in page_data.keys():
                    projects = page_data.pop('projects')
                    all_data['user_project'] += [{
                        'username': page_data['username'],
                        'project': 'www.crowdrise.com' + x
                    } for x in projects]
                if 'events' in page_data.keys():
                    events = page_data.pop('events')
                    all_data['charity_event'] += [{
                        'charity': page_data['url'],
                        'event': 'www.crowdrise.com' + x
                    } for x in events]
                if 'team_members' in page_data.keys():
                    team_members = page_data.pop('team_members')
                    all_data['team'] += team_members
                if 'donations' in page_data.keys():
                    donations = page_data.pop('donations')
                    all_data['donation'] += donations
                all_data[page_type].append(page_data)
        except:
            print('failed on url "{}"'.format(url))
            logging.error('failed on url "{}"'.format(url))
            raise
    all_data['user_project'] = [
        x for x in all_data['user_project']
        if re.match(CROWDRISE_URL_RE, 'https://' + x['project'])
    ]
    db = db_connections.get_fungrosencrantz_schema('crowdrise')
    db_connections.multi_table_upload(data=all_data,
                                      db=db,
                                      ensure=True,
                                      process_num=None,
                                      chunk_size=3000)
    scrape_time = time.time()
    # update table with new entries
    db.query('truncate table _recently_updated')
    db.executable.execute(
        'insert ignore into _recently_updated values (%s, %s)',
        [(x, scrape_time) for x in scraped_urls])
    db.executable.execute("""
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        SELECT
            fundraiser.url,
            CASE WHEN fundraiser_url IS NULL  # individual fundraiser
                THEN fundraiser.username
            ELSE  # team fundraiser
                ''  # give team total raised for fundraiser, then use `team` to give individual contributions
            END,
            coalesce(team_total_raised, total_raised),
            NULL,
            _recently_updated.last_scrape_unix,
            'fundraiser'
        FROM fundraiser
        join _recently_updated on _recently_updated.url = fundraiser.url
        LEFT JOIN team ON fundraiser.url = team.fundraiser_url
        GROUP BY fundraiser.url;

        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select fundraiser_url, username, amount_raised, goal, _recently_updated.last_scrape_unix, 'team'
        from team
        join _recently_updated on _recently_updated.url = team.fundraiser_url;

        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select charity.url, '', money_raised, null, _recently_updated.last_scrape_unix, 'charity'
        from charity
        join _recently_updated on _recently_updated.url = charity.url;

        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select event.url, '', amount_raised, goal, _recently_updated.last_scrape_unix, 'event'
        from event
        join _recently_updated on _recently_updated.url = event.url;

        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select user.url, username, money_raised, null, _recently_updated.last_scrape_unix, 'user'
        from user
        join _recently_updated on _recently_updated.url = user.url;
    """)
    q = """
        update html
        set last_scrape = to_timestamp({})
        where loc in ({});""".format(
        scrape_time, ", ".join(["'" + x + "'" for x in scraped_urls]))
    theta_conn, theta_cur = keep_theta_conn_alive(theta_conn, theta_cur)
    theta_cur.execute(q)
    if test_url is None and limit != 0:
        theta_conn.commit()
    theta_cur.close()
    theta_conn.close()
    if len(html_data) < limit or test_url is not None:
        return False
    else:
        return True
def get_aggregate_funding_trend_df(status='successful', min_data_points=4, limit=None):
    db = db_connections.get_fungrosencrantz_schema(schema='kickstarter')
    sql = """
    SELECT
        projectid,
        funding_trend.amount_pledged,
        project.currency,
        funding_trend.update_count,
        funding_trend.comment_count,
        funding_trend.backer_count,
        DATEDIFF(date_added, start_date) AS `day`,
        goal
    FROM funding_trend
    JOIN project ON funding_trend.projectid = project.id
    WHERE project.status = "{}"
    ORDER BY projectid
    {}
    """.format(status, 'limit {}'.format(limit) if limit is not None else "")
    logging.info('Acquiring data')
    complete_dataframe = pd.read_sql(sql, db.executable)
    logging.info('Converting currencies to USD')
    convert_kickstarter_currency_df_to_usd(complete_dataframe)
    complete_dataframe['percent_of_goal'] = np.divide(
        complete_dataframe['amount_pledged'], complete_dataframe['goal']) * 100
    del complete_dataframe['currency']
    # add interpolation on the various columns per project_id
    logging.info('Interpolating...')
    grouped = complete_dataframe.groupby('projectid')
    filled_df = None
    for projectid, sub_df in grouped:
        if 0 not in sub_df['day'].values:
            day_0_df = pd.DataFrame(
                [[projectid, 0, 0, 0, 0, 0, sub_df['goal'].values[0], 0]],
                columns=sub_df.columns)
            sub_df = day_0_df.append(sub_df)
        if len(sub_df) <= min_data_points:
            continue
        interp_functions = dict(
            percent_of_goal=interp1d(sub_df['day'], sub_df['percent_of_goal']),
            amount_pledged=interp1d(sub_df['day'], sub_df['amount_pledged']),
            update_count=interp1d(sub_df['day'], sub_df['update_count']),
            comment_count=interp1d(sub_df['day'], sub_df['comment_count']),
            backer_count=interp1d(sub_df['day'], sub_df['backer_count']))
        data = dict()
        num_days = sub_df['day'].max() + 1
        days_to_evaluate_for = range(num_days)
        data['projectid'] = pd.Series([projectid] * num_days)
        data['amount_pledged'] = pd.Series(
            interp_functions['amount_pledged'](days_to_evaluate_for))
        data['update_count'] = pd.Series([
            int(round(x))
            for x in interp_functions['update_count'](days_to_evaluate_for)
        ])
        data['comment_count'] = pd.Series([
            int(round(x))
            for x in interp_functions['comment_count'](days_to_evaluate_for)
        ])
        data['backer_count'] = pd.Series([
            int(round(x))
            for x in interp_functions['backer_count'](days_to_evaluate_for)
        ])
        data['day'] = pd.Series(days_to_evaluate_for)
        data['goal'] = pd.Series([sub_df['goal'].values[0]] * num_days)
        data['percent_of_goal'] = pd.Series(
            interp_functions['percent_of_goal'](days_to_evaluate_for))
        partial_filled_df = pd.DataFrame.from_dict(data=data)
        if filled_df is None:
            filled_df = partial_filled_df
        else:
            filled_df = filled_df.append(partial_filled_df)
    del filled_df['projectid']
    del filled_df['goal']
    logging.info('Grouping and taking median')
    grouped = filled_df.groupby('day')
    grouped_df = grouped.median()
    grouped_df['count'] = grouped.size()
    return grouped_df
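
# A short usage sketch for the aggregate trend, assuming matplotlib's pyplot
# is imported as plt (as in the plotting scripts in this repo); the helper
# name is hypothetical, and the column names come from the DataFrame built above.
def plot_median_funding_trend():  # hypothetical helper
    grouped_df = get_aggregate_funding_trend_df(status='successful')
    ax = grouped_df['percent_of_goal'].plot()  # index is day-of-campaign
    ax.set_xlabel('Day of campaign')
    ax.set_ylabel('Median percent of goal')
    plt.savefig('funding_trend_median.png', dpi=350)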
def run():
    db = db_connections.get_fungrosencrantz_schema('Kiva')
    api = KivaAPI()
    upload_new_loans_and_loan_lenders(db,
                                      api,
                                      from_file='missing_loan_data.json',
                                      update=False)