def process_podcast():
    """
    Gets and fully processes one podcast (in iTunes)

    Fetches one unprocessed podcast from the database, processes its
    metadata, reviews and episodes, appends the outcome to ``scrape.log``
    and finally calls ``db.mark_as_itunes`` for the podcast URL.
    Parameters
        None
    Returns
        None
    """
    def timestamp():
        # Same timestamp format for every log/console line of this function.
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    def pause(shape, loc):
        # Draw ONE scalar delay. Passing ``size=1`` yields a 1-element array,
        # which ``time.sleep`` rejects with TypeError on Python >= 3.8.
        time.sleep(float(exponnorm.rvs(shape, loc=loc, scale=1)))

    conn = db.connect_db()
    podcast_name, podcast_url = db.get_unprocessed_podcast(
        conn, mark_in_progress=True)
    podcast_id = get_podcast_id(podcast_url)

    with open("scrape.log", "a") as log_file:
        podcast_dict, podcast_data, page_data = process_metadata(
            podcast_name, podcast_url, podcast_id, conn, log_file)

        # ``== False`` is kept on purpose: the types returned by
        # process_metadata are not visible here, so the comparison is not
        # tightened to ``is False``.
        if podcast_data == False or podcast_dict == False or page_data == False:
            # Long pause before recording the failure and giving up.
            pause(24, 200)
            log_file.write("{} | failed on {}\n".format(timestamp(),
                                                        podcast_name))
            return None

        total_reviews = podcast_dict["review_count"]
        pause(2, 22)
        process_reviews(podcast_id, podcast_name, total_reviews, conn,
                        log_file)
        process_episodes(podcast_data, page_data, podcast_id, conn, log_file)

        # One timestamp for both sinks so the log and console lines agree.
        message = "{} | success on {}".format(timestamp(), podcast_name)
        log_file.write(message + "\n")
        print(message)
        pause(2, 22)

    db.mark_as_itunes(conn, podcast_url)
def main():
    """
    Repeatedly call ``get_art`` and record each outcome in ``art.log``.

    Ensures the ``./artwork`` directory exists, then loops until ``get_art``
    reports a failure whose podcast id is the sentinel ``"db_fail"``. After
    each iteration that does not stop the loop, the podcast row is updated to
    ``processed = 'artwork'`` and the change is committed.
    Parameters
        None
    Returns
        None
    """
    def pause():
        # Draw ONE scalar delay. Passing ``size=1`` yields a 1-element array,
        # which ``time.sleep`` rejects with TypeError on Python >= 3.8.
        time.sleep(float(exponnorm.rvs(2, loc=20, scale=1)))

    conn, cursor = db.connect_db()

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("./artwork", exist_ok=True)

    with open("art.log", "a") as logfile:
        while True:
            success, podcast_id, trace = get_art(conn, cursor)
            f_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

            if not success:
                # Both failure kinds log the trace; only "db_fail" is fatal.
                logfile.write(f"{f_time} | {trace}\n")
                print(f"{f_time} | {trace}")
                if podcast_id == "db_fail":
                    break
                pause()

            # NOTE(review): a non-fatal failure falls through to this update
            # and to the "Success" message, as in the original statement
            # order. Presumably this lets the loop move on to the next
            # podcast; confirm that logging "Success" here is intended.
            cursor.execute(
                "update podcasts "
                "set processed = 'artwork' "
                "where podcast_id = (%s)", [podcast_id])
            conn.commit()
            logfile.write(f"{f_time} | Success on {podcast_id}\n")
            print(f"{f_time} | Success on {podcast_id}")
            pause()
def sample_exgauss(size=default_size, param_dict=param_dict):
    """
    Draw non-negative samples from an exponentially modified Gaussian.

    Samples are drawn with ``exponnorm.rvs``; every negative value is
    replaced by a fresh draw until none remain.

    Parameters
    ----------
    size:
        Passed to ``exponnorm.rvs`` as ``size``.
    param_dict: dict
        Must provide the keys ``'K'``, ``'loc'`` and ``'scale'``.

    Returns
    -------
    out: ndarray
        Samples with every entry >= 0.
    """
    shape_k = param_dict['K']
    loc = param_dict['loc']
    scale = param_dict['scale']

    out = exponnorm.rvs(shape_k, loc, scale, size=size)

    # Iterative rejection instead of recursion: the redraw count is bounded
    # only by the distribution, so recursing could exhaust the call stack.
    # Draw order and placement match the recursive formulation.
    negative = out < 0
    while np.any(negative):
        out[negative] = exponnorm.rvs(shape_k, loc, scale,
                                      size=int(np.sum(negative)))
        negative = out < 0
    return out
def generate_uncertainties(N, dist='Gamma', rseed=None):
    """
    Generate uncertainties for the white noise component of the
    synthetic light curve.

    Parameters
    ----------
    N: positive integer
        Length of the returned uncertainty vector
    dist: {'EMG', 'Gamma'}
        Probability density function (PDF) used to generate the
        uncertainties
    rseed:
        Seed for the random number generator

    Returns
    -------
    s: ndarray
        Vector containing the uncertainties
    expected_s_2: float
        Expectation of the square of s computed analytically

    Raises
    ------
    ValueError
        If ``dist`` is neither 'EMG' nor 'Gamma'.
    """
    # Reject unknown names up front, before touching the global RNG state.
    if dist not in ('EMG', 'Gamma'):
        raise ValueError(
            "dist must be 'EMG' or 'Gamma', got {!r}".format(dist))

    # Seeds numpy's global RNG, which the scipy draws below use by default.
    np.random.seed(rseed)

    if dist == 'EMG':
        # Exponentially modified Gaussian, scipy parameterization
        # (K = 1 / (sigma * lambda)):
        #   mean     = mu + K * sigma
        #   variance = sigma**2 + (K * sigma)**2
        K = 1.824328605481941
        sigma = 0.05 * 0.068768312946785953
        mu = 0.05 * 0.87452567616276777
        # IMPORTANT NOTE
        # These parameters were obtained after fitting uncertainties
        # coming from 10,000 light curves of the VVV survey
        # E[s**2] = variance + mean**2
        expected_s_2 = (sigma**2 + mu**2 + 2 * K * mu * sigma
                        + 2 * K**2 * sigma**2)
        s = exponnorm.rvs(K, loc=mu, scale=sigma, size=N)
    else:
        # Gamma:
        #   mean     = k * sigma
        #   variance = k * sigma**2
        k = 3.0
        sigma = 0.05 / k  # mean=0.05, var=0.05**2/k
        s = gamma.rvs(k, loc=0.0, scale=sigma, size=N)
        # E[s**2] = variance + mean**2
        expected_s_2 = k * (1 + k) * sigma**2
    return s, expected_s_2
def generate_uncertainties(N, dist='Gamma', rseed=None):
    """
    Generate uncertainties for the white noise component of the
    synthetic light curve.

    Parameters
    ----------
    N: positive integer
        Length of the returned uncertainty vector
    dist: {'EMG', 'Gamma'}
        Probability density function (PDF) used to generate the
        uncertainties
    rseed:
        Seed for the random number generator

    Returns
    -------
    s: ndarray
        Vector containing the uncertainties
    expected_s_2: float
        Expectation of the square of s computed analytically

    Raises
    ------
    ValueError
        If ``dist`` is neither 'EMG' nor 'Gamma'.
    """
    # Reject unknown names up front, before touching the global RNG state.
    if dist not in ('EMG', 'Gamma'):
        raise ValueError(
            "dist must be 'EMG' or 'Gamma', got {!r}".format(dist))

    # Seeds numpy's global RNG, which the scipy draws below use by default.
    np.random.seed(rseed)

    if dist == 'EMG':
        # Exponentially modified Gaussian, scipy parameterization
        # (K = 1/(sigma*lambda)):
        #   mean     = mu + K*sigma
        #   variance = sigma**2 + (K*sigma)**2
        K = 1.824328605481941
        sigma = 0.05*0.068768312946785953
        mu = 0.05*0.87452567616276777
        # IMPORTANT NOTE
        # These parameters were obtained after fitting uncertainties
        # coming from 10,000 light curves of the VVV survey
        # E[s**2] = variance + mean**2
        expected_s_2 = sigma**2 + mu**2 + 2*K*mu*sigma + 2*K**2*sigma**2
        s = exponnorm.rvs(K, loc=mu, scale=sigma, size=N)
    else:
        # Gamma:
        #   mean     = k*sigma
        #   variance = k*sigma**2
        k = 3.0
        sigma = 0.05/k  # mean=0.05, var=0.05**2/k
        s = gamma.rvs(k, loc=0.0, scale=sigma, size=N)
        # E[s**2] = variance + mean**2
        expected_s_2 = k*(1+k)*sigma**2
    return s, expected_s_2
def request(url, podcast_name, headers, f):
    """
    Returns request object if a successful request is made

    A 403 response is retried after a long randomized pause, up to five
    attempts in total. Any other non-200 response is printed, written to
    ``f`` and ends the attempt immediately.
    Parameters
        url (string): url to request
        podcast_name (string): podcast name for writing errors
        headers (dict): headers to use to make the request
        f (file object): log file for writing errors
    Returns
        response (requests response object): None when unsuccessful
        success (bool): True if successful, else False
    """
    max_tries = 5
    for _ in range(max_tries):
        response = requests.get(url, headers=headers)
        status = response.status_code

        if status == 200:
            return response, True

        if status == 403:
            # Draw ONE scalar delay. Passing ``size=1`` yields a 1-element
            # array, which ``time.sleep`` rejects with TypeError on
            # Python >= 3.8.
            time.sleep(float(exponnorm.rvs(20, loc=220, scale=1)))
            continue

        # Every other status is final: report it and stop retrying.
        if status == 400:
            print("Something went wrong with {}!! "
                  "(Error Code 400)".format(podcast_name))
            f.write("{}\n{}\n{}\n".format(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                podcast_name, response.text))
        else:
            print("Something went wrong with {}!!".format(podcast_name))
            f.write("{}\n{}\n{}\n{}\n".format(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                podcast_name, status, response.text))
        return None, False

    # Every attempt was answered with 403.
    return None, False
def process_reviews(podcast_id, podcast_name, total_reviews, conn, log_file):
    """
    Wrapper function to process reviews of a podcast

    Requests review pages in steps of 100 reviews, parses every review of a
    page and passes it to ``db.update_reviews``. Requests use the
    module-level ``headers``.
    Parameters
        podcast_id (int): unique podcast identifier
        podcast_name (string): podcast name used in log/console messages
        total_reviews (int): total number of reviews
        conn: active psycopg2 connection
        log_file (file object in writeable mode): log file to write errors
    Returns
        None
    """
    page_size = 100
    num_pages = math.ceil(total_reviews / page_size)
    current_index = 0
    current_page = 1

    while current_index < total_reviews:
        review_url = review_url_constructor(podcast_id, current_index,
                                            total_reviews)
        response, revreq_success = request(review_url, podcast_name, headers,
                                           log_file)
        if not revreq_success:
            # A failed request ends review processing for this podcast.
            fail_handler(podcast_name, "review request", log_file)
            break
        current_index += page_size

        reviews, review_success = process_podcast_request(response, 1,
                                                          log_file)
        if review_success:
            for review in reviews:
                review_dict = parse_review(review, podcast_id)
                db.update_reviews(review_dict, conn)
            print("Success on page {} of {} on {}".format(current_page,
                                                          num_pages,
                                                          podcast_name))
        else:
            # Report the page and move on: its payload is not usable, so it
            # is neither iterated nor announced as a success.
            fail_handler(podcast_name, "review page data", log_file)

        current_page += 1
        if current_index < total_reviews:
            # Draw ONE scalar delay. Passing ``size=1`` yields a 1-element
            # array, which ``time.sleep`` rejects with TypeError on
            # Python >= 3.8.
            time.sleep(float(exponnorm.rvs(2, loc=22, scale=1)))
def process_stitcher_podcast(conn, cursor, log_file):
    """
    Completely processes one podcast from stitcher

    Requests and parses the podcast's stitcher page, then pages through its
    reviews and stores each one. The podcast is marked via
    ``mark_as_stitcher`` when it has no reviews or when every review update
    succeeded; failures are reported through ``stitcher_fail_handler``.
    Parameters
        conn, cursor: active psycopg2 objects
        log_file (writeable file object): file object to write errors
    Returns
        None
    """
    def pause():
        # Draw ONE scalar delay. Requesting ``size=1`` yields a 1-element
        # array, which ``time.sleep`` rejects with TypeError on
        # Python >= 3.8.
        time.sleep(float(exponnorm.rvs(3, loc=20, scale=1)))

    headers = {
        "User-Agent": user_agent.generate_user_agent(os=None,
                                                     navigator=None,
                                                     platform=None,
                                                     device_type="desktop")
    }
    stitcher_url, podcast_id = get_stitcher_url(conn, cursor)

    stitcher_page, request_success = request_stitcher_page(
        stitcher_url, headers, log_file)
    if not request_success:
        stitcher_fail_handler(conn, cursor, stitcher_url, "requesting page",
                              log_file)
        pause()
        return None

    stitcher_id, parse_success = parse_stitcher_page(stitcher_page, log_file)
    if not parse_success:
        stitcher_fail_handler(conn, cursor, stitcher_url, "parsing page",
                              log_file)
        pause()
        return None

    # Placeholder so the loop runs once; get_stitcher_reviews returns the
    # real total together with the next page index.
    total_reviews = 100
    page_index = 0
    # Starts True so a podcast whose pages contain no reviews is still
    # marked, and turns False as soon as any single update fails.
    all_updates_ok = True

    while page_index * 100 < total_reviews:
        reviews, page_index, total_reviews, review_success = get_stitcher_reviews(
            stitcher_id, headers, log_file, page_index=page_index)
        if not review_success:
            if reviews == "no_reviews":
                mark_as_stitcher(conn, cursor, podcast_id)
            else:
                stitcher_fail_handler(conn, cursor, stitcher_url,
                                      "parsing reviews", log_file)
            return None

        for review in reviews:
            review_dict = parse_stitcher_review(podcast_id, review)
            review_update_success = update_reviews_stitcher(
                review_dict, conn, cursor)
            if not review_update_success:
                all_updates_ok = False
                stitcher_fail_handler(conn, cursor, stitcher_url,
                                      "updating reviews", log_file)

        total_pages = math.ceil(total_reviews / 99)
        print("Success on page {} of {} for {}".format(page_index,
                                                       total_pages,
                                                       podcast_id))
        pause()

    if all_updates_ok:
        mark_as_stitcher(conn, cursor, podcast_id)
def _flag_stitcher_row(conn, statement, itunes_url):
    """Run one parameterized UPDATE for *itunes_url*, commit it, close the cursor."""
    cursor = conn.cursor()
    try:
        cursor.execute(statement, [itunes_url])
        conn.commit()
    finally:
        cursor.close()


def process_podcast(conn, log_file):
    """
    Wrapper function to process a podcast

    Looks the podcast up through a Google request (using the module-level
    ``headers``), parses the result and stores it with ``update_db``. Rows
    that cannot be resolved are flagged in the ``stitcher`` table as
    'problem' or 'no result'.
    Parameters
        conn: active psycopg2 connection
        log_file (writeable file object): log file to write errors
    Returns
        None
    """
    def pause():
        # Draw ONE scalar delay. Requesting ``size=1`` yields a 1-element
        # array, which ``time.sleep`` rejects with TypeError on
        # Python >= 3.8.
        time.sleep(float(exponnorm.rvs(2, loc=45, scale=1)))

    problem_sql = ("update stitcher set stitcher_url = 'problem' "
                   "where itunes_url = (%s)")
    no_result_sql = ("UPDATE stitcher SET search_name = 'no result', "
                     "stitcher_url = 'no result' "
                     "WHERE itunes_url = (%s)")

    podcast_name, itunes_url = get_podcast_name(conn)
    google_url = google_url_constructor(podcast_name)
    google_result = google_request(google_url, headers)

    if google_result.status_code == 503:
        # Back off for an hour without touching the database row.
        print("YOU'VE BEEN DISCOVERED!!!!")
        time.sleep(3600)
        return None

    if google_result.status_code != 200:
        print("failure on {}".format(podcast_name))
        log_file.write("failure on {}\n".format(podcast_name))
        # Committed like the other 'problem' update below; previously this
        # branch executed the UPDATE without ever committing it.
        _flag_stitcher_row(conn, problem_sql, itunes_url)
        pause()
        return None

    search_url, search_name, parse_success = parse_google_result(google_result)
    if not parse_success:
        # On a parse failure, ``search_url == True`` selects the "no results"
        # branch; any other value is handled as a generic failure.
        if search_url == True:
            print("no results for {}".format(podcast_name))
            log_file.write("no results for {}\n".format(podcast_name))
            _flag_stitcher_row(conn, no_result_sql, itunes_url)
        else:
            print("failure on {}\n{}".format(podcast_name,
                                             google_result.text))
            # Trailing newline added so the next log entry starts on its
            # own line.
            log_file.write("failure on {}\n{}\n".format(podcast_name,
                                                        google_result.text))
            _flag_stitcher_row(conn, problem_sql, itunes_url)
        pause()
        return None

    success = update_db(conn, itunes_url, search_url, search_name)
    outcome = "success" if success else "failure"
    print("{} on {}".format(outcome, podcast_name))
    # Newline-terminated, consistent with the other log entries.
    log_file.write("{} on {}\n".format(outcome, podcast_name))
    pause()
# Plot the pdf on a 100-point grid spanning the 1st to the 99th percentile.
x = np.linspace(
    exponnorm.ppf(0.01, K),
    exponnorm.ppf(0.99, K),
    100,
)
ax.plot(x, exponnorm.pdf(x, K), 'r-', lw=5, alpha=0.6,
        label='exponnorm pdf')

# Calling the distribution object (as a function) fixes the shape, location
# and scale parameters and returns a "frozen" RV object holding them.
# Plot the frozen ``pdf`` on the same grid:
rv = exponnorm(K)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Accuracy check of ``cdf`` against ``ppf``: the round trip should give
# back the input probabilities.
vals = exponnorm.ppf([0.001, 0.5, 0.999], K)
np.allclose([0.001, 0.5, 0.999], exponnorm.cdf(vals, K))  # True

# Draw random variates ...
r = exponnorm.rvs(K, size=1000)

# ... and overlay their normalized histogram on the pdf curves.
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
def _h_p_i_rvs(mu, sigma, tau_p, N_i):
    """Helper for h_p_emg_rvs: draw N_i exponnorm variates.

    The draw uses location ``mu``, scale ``sigma`` and shape
    ``tau_p / sigma``.
    """
    shape_k = tau_p / sigma
    return exponnorm.rvs(shape_k, loc=mu, scale=sigma, size=N_i)
def _h_m_i_rvs(mu, sigma, tau_m, N_i):
    """Helper for h_m_emg_rvs: ``mu`` minus N_i zero-located exponnorm variates.

    The variates use location 0, scale ``sigma`` and shape ``tau_m / sigma``.
    """
    shape_k = tau_m / sigma
    draws = exponnorm.rvs(shape_k, loc=0, scale=sigma, size=N_i)
    return mu - draws