예제 #1
0
def process_podcast():
    """
    Gets and fully processes one podcast (in iTunes)

    Fetches an unprocessed podcast from the database (flagging it as in
    progress), scrapes its metadata, reviews and episodes, and then calls
    db.mark_as_itunes for it. Progress is appended to "scrape.log". If the
    metadata step fails, the failure is logged after a long pause and the
    podcast is not marked.

    Parameters
    None

    Returns
    None
    """
    def timestamp():
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    def pause(shape, loc):
        # Sleep for a random, exponentially modified Gaussian delay. The
        # draw is a scalar (no size=) converted to float: time.sleep()
        # cannot convert the one-element array that size=1 returns on
        # current Python versions.
        time.sleep(float(exponnorm.rvs(shape, loc=loc, scale=1)))

    conn = db.connect_db()
    podcast_name, podcast_url = db.get_unprocessed_podcast(
        conn, mark_in_progress=True)
    podcast_id = get_podcast_id(podcast_url)
    with open("scrape.log", "a") as log_file:
        podcast_dict, podcast_data, page_data = process_metadata(
            podcast_name, podcast_url, podcast_id, conn, log_file)
        # NOTE(review): the equality test against False is kept on purpose;
        # the exact failure sentinel of process_metadata is not visible here,
        # so switching to `not x` / `is False` could change behavior.
        if podcast_data == False or podcast_dict == False or page_data == False:  # noqa: E712
            pause(24, 200)
            log_file.write("{} | failed on {}\n".format(timestamp(),
                                                        podcast_name))
            return None
        total_reviews = podcast_dict["review_count"]
        pause(2, 22)
        process_reviews(podcast_id, podcast_name, total_reviews, conn, log_file)
        process_episodes(podcast_data, page_data, podcast_id, conn, log_file)
        log_file.write("{} | success on {}\n".format(timestamp(),
                                                     podcast_name))
        print("{} | success on {}".format(timestamp(), podcast_name))
        pause(2, 22)
    db.mark_as_itunes(conn, podcast_url)
예제 #2
0
def main():
    """
    Fetches podcast artwork in a loop until a database failure occurs

    Each iteration calls get_art, logs the outcome to "art.log" and stdout,
    flags the podcast as processed ('artwork') in the podcasts table and
    sleeps for a random delay. The loop only ends when get_art reports
    failure with the podcast id "db_fail".

    Parameters
    None

    Returns
    None
    """
    conn, cursor = db.connect_db()
    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs("./artwork", exist_ok=True)

    def pause():
        # Scalar draw converted to float: time.sleep() cannot convert the
        # one-element array returned by rvs(..., size=1) on current Python
        # versions.
        time.sleep(float(exponnorm.rvs(2, loc=20, scale=1)))

    with open("art.log", "a") as logfile:
        while True:
            success, podcast_id, trace = get_art(conn, cursor)
            f_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            if not success:
                logfile.write(f"{f_time} | {trace}\n")
                print(f"{f_time} | {trace}")
                if podcast_id == "db_fail":
                    break
                pause()
                # NOTE(review): a non-database failure deliberately keeps
                # falling through (as in the original), so the podcast is
                # still flagged as processed and a "Success" line is logged.
                # Confirm whether this is meant to skip bad podcasts or
                # whether a `continue` is missing here.
            cursor.execute(
                "update podcasts "
                "set processed = 'artwork' "
                "where podcast_id = (%s)", [podcast_id])
            conn.commit()
            logfile.write(f"{f_time} | Success on {podcast_id}\n")
            print(f"{f_time} | Success on {podcast_id}")
            pause()
 def sample_exgauss(size=default_size, param_dict=param_dict):
     """Draw exponnorm samples and redraw every negative value.

     ``param_dict`` supplies the 'K', 'loc' and 'scale' arguments passed to
     ``exponnorm.rvs``. Entries that come out below zero are replaced by
     fresh draws until none remain; the resulting array is returned.
     """
     shape, loc, scale = (param_dict[key] for key in ('K', 'loc', 'scale'))
     draws = exponnorm.rvs(shape, loc, scale, size=size)
     negative = draws < 0
     # Redraw only the offending positions, one batch per pass
     while np.sum(negative) > 0:
         draws[negative] = exponnorm.rvs(shape, loc, scale,
                                         size=np.sum(negative))
         negative = draws < 0
     return draws
예제 #4
0
def generate_uncertainties(N, dist='Gamma', rseed=None):
    """
    Generate uncertainties for the white noise component of a synthetic
    light curve.

    Parameters
    ----------
    N: positive integer
        Length of the returned uncertainty vector
    dist: {'EMG', 'Gamma'}
        Probability density function (PDF) used to generate the
        uncertainties
    rseed:
        Seed for the random number generator. It is applied to NumPy's
        global generator through ``np.random.seed``.

    Returns
    -------
    s: ndarray
        Vector containing the uncertainties
    expected_s_2: float
        Expectation of the square of s computed analytically

    Raises
    ------
    ValueError
        If ``dist`` is not one of the supported distribution names

    """
    # Fail early with a clear message instead of an UnboundLocalError at
    # the return statement.
    if dist not in ('EMG', 'Gamma'):
        raise ValueError(
            "Unknown dist {!r}; expected 'EMG' or 'Gamma'".format(dist))
    np.random.seed(rseed)
    if dist == 'EMG':  # Exponential modified Gaussian
        # With scipy's exponnorm(K, loc=mu, scale=sigma):
        #   the mean is mu + K*sigma
        #   the variance is sigma**2 + (K*sigma)**2
        # so E[s**2] = variance + mean**2, which expands to the
        # expression used for expected_s_2 below.
        K = 1.824328605481941
        sigma = 0.05 * 0.068768312946785953
        mu = 0.05 * 0.87452567616276777
        # IMPORTANT NOTE
        # These parameters were obtained after fitting uncertainties
        # coming from 10,000 light curves of the VVV survey
        expected_s_2 = sigma**2 + mu**2 + 2 * K * mu * sigma + 2 * K**2 * sigma**2
        s = exponnorm.rvs(K, loc=mu, scale=sigma, size=N)
    else:  # dist == 'Gamma'
        # The mean of a gamma rv is k*sigma
        # The variance of a gamma rv is k*sigma**2
        k = 3.0
        sigma = 0.05 / k  # mean=0.05, var=0.05**2/k
        s = gamma.rvs(k, loc=0.0, scale=sigma, size=N)
        expected_s_2 = k * (1 + k) * sigma**2
    return s, expected_s_2
예제 #5
0
파일: generator.py 프로젝트: phuijse/P4J
def generate_uncertainties(N, dist='Gamma', rseed=None):
    """
    Generate uncertainties for the white noise component of a synthetic
    light curve.

    Parameters
    ----------
    N: positive integer
        Length of the returned uncertainty vector
    dist: {'EMG', 'Gamma'}
        Probability density function (PDF) used to generate the
        uncertainties
    rseed:
        Seed for the random number generator. It is applied to NumPy's
        global generator through ``np.random.seed``.

    Returns
    -------
    s: ndarray
        Vector containing the uncertainties
    expected_s_2: float
        Expectation of the square of s computed analytically

    Raises
    ------
    ValueError
        If ``dist`` is not one of the supported distribution names

    """
    # Reject unsupported names up front; otherwise neither branch runs and
    # the return statement fails with an UnboundLocalError.
    if dist not in ('EMG', 'Gamma'):
        raise ValueError(
            "Unknown dist {!r}; expected 'EMG' or 'Gamma'".format(dist))
    np.random.seed(rseed)
    if dist == 'EMG':  # Exponential modified Gaussian
        # With scipy's exponnorm(K, loc=mu, scale=sigma):
        #   the mean is mu + K*sigma
        #   the variance is sigma**2 + (K*sigma)**2
        # so E[s**2] = variance + mean**2, which expands to the
        # expression used for expected_s_2 below.
        K = 1.824328605481941
        sigma = 0.05*0.068768312946785953
        mu = 0.05*0.87452567616276777
        # IMPORTANT NOTE
        # These parameters were obtained after fitting uncertainties
        # coming from 10,000 light curves of the VVV survey
        expected_s_2 = sigma**2 + mu**2 + 2*K*mu*sigma + 2*K**2*sigma**2
        s = exponnorm.rvs(K, loc=mu, scale=sigma, size=N)
    else:  # dist == 'Gamma'
        # The mean of a gamma rv is k*sigma
        # The variance of a gamma rv is k*sigma**2
        k = 3.0
        sigma = 0.05/k  # mean=0.05, var=0.05**2/k
        s = gamma.rvs(k, loc=0.0, scale=sigma, size=N)
        expected_s_2 = k*(1+k)*sigma**2
    return s, expected_s_2
예제 #6
0
def request(url, podcast_name, headers, f):
    """
    Makes a GET request, retrying while the server answers 403

    A 200 response is returned immediately. A 403 response triggers a long
    random pause followed by another attempt, up to five attempts in total.
    Any other status code is reported on stdout, written to the log file
    and ends the attempt without retrying.

    Parameters
    url (string): url to request
    podcast_name (string): podcast name for writing errors
    headers (dict): headers to use to make the request
    f (file object): log file for writing errors

    Returns
    response (requests response object): the response, or None on failure
    success (bool): True if successful, else False
    """
    max_tries = 5
    for _ in range(max_tries):
        response = requests.get(url, headers=headers)
        status = response.status_code
        if status == 200:
            return response, True
        if status == 403:
            # Scalar draw converted to float: time.sleep() cannot convert
            # the one-element array returned by rvs(..., size=1) on current
            # Python versions.
            time.sleep(float(exponnorm.rvs(20, loc=220, scale=1)))
            continue
        if status == 400:
            # The original incremented an undefined local `page_index` here,
            # which raised UnboundLocalError before anything was logged.
            print("Something went wrong with {}!! "
                  "(Error Code 400)".format(podcast_name))
            f.write("{}\n{}\n{}\n".format(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                podcast_name, response.text))
        else:
            print("Something went wrong with {}!!".format(podcast_name))
            f.write("{}\n{}\n{}\n{}\n".format(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                podcast_name, response.status_code, response.text))
        return None, False
    # Every attempt was answered with 403
    return None, False
예제 #7
0
def process_reviews(podcast_id, podcast_name, total_reviews, conn, log_file):
    """
    Wrapper function to process reviews of a podcast

    Reviews are requested in pages of 100. Each page is parsed and every
    review is written to the database. Processing stops at the first page
    that cannot be requested or parsed. Requests use the module-level
    `headers`.

    Parameters
    podcast_id (int): unique podcast identifier
    podcast_name (string): podcast name used in progress and error messages
    total_reviews (int): total number of reviews
    conn: active psycopg2 connection
    log_file (file object in writeable mode): log file to write errors

    Returns
    None
    """
    current_index = 0
    num_pages = math.ceil(total_reviews / 100)
    current_page = 1
    while current_index < total_reviews:
        review_url = review_url_constructor(podcast_id, current_index,
                                            total_reviews)
        response, revreq_success = request(review_url, podcast_name,
                                           headers, log_file)
        if not revreq_success:
            fail_handler(podcast_name, "review request", log_file)
            break
        current_index += 100
        reviews, review_success = process_podcast_request(response,
                                                         1, log_file)
        if not review_success:
            # Was misspelled `fail_hander` (NameError). Stop here, as for a
            # failed request, rather than iterating over a failed result.
            fail_handler(podcast_name, "review page data", log_file)
            break

        for review in reviews:
            review_dict = parse_review(review, podcast_id)
            db.update_reviews(review_dict, conn)
        print("Success on page {} of {} on {}".format(current_page, num_pages,
                                                      podcast_name))
        current_page += 1
        if current_index < total_reviews:
            # Pause only when another page follows. The draw is a scalar
            # converted to float because time.sleep() cannot convert the
            # one-element array returned by rvs(..., size=1) on current
            # Python versions.
            time.sleep(float(exponnorm.rvs(2, loc=22, scale=1)))
0
def process_stitcher_podcast(conn, cursor, log_file):
    """
    Completely processes one podcast from stitcher

    Requests and parses the podcast's stitcher page, then pages through its
    reviews and stores each one. The podcast is marked as processed when it
    has no reviews, or when every review update succeeded. HTTP requests use
    a freshly generated desktop User-Agent header.

    Parameters
    conn, cursor: active psycopg2 objects
    log_file (writeable file object): file object to write errors

    Returns
    None
    """
    def pause():
        # Same distribution as the original positional call
        # rvs(3, 20, 1, 1), i.e. K=3, loc=20, scale=1, but drawn as a scalar
        # and converted to float: time.sleep() cannot convert the
        # one-element array that size=1 returns on current Python versions.
        time.sleep(float(exponnorm.rvs(3, loc=20, scale=1)))

    headers = {
        "User-Agent":
        user_agent.generate_user_agent(os=None,
                                       navigator=None,
                                       platform=None,
                                       device_type="desktop")
    }
    stitcher_url, podcast_id = get_stitcher_url(conn, cursor)
    stitcher_page, request_success = request_stitcher_page(
        stitcher_url, headers, log_file)
    if not request_success:
        stitcher_fail_handler(conn, cursor, stitcher_url, "requesting page",
                              log_file)
        pause()
        return None
    stitcher_id, parse_success = parse_stitcher_page(stitcher_page, log_file)
    if not parse_success:
        stitcher_fail_handler(conn, cursor, stitcher_url, "parsing page",
                              log_file)
        pause()
        return None
    # Placeholder so the loop body runs once; the real total is returned by
    # get_stitcher_reviews.
    total_reviews = 100
    page_index = 0
    # Tracks whether every review update succeeded. The original flag was
    # overwritten per review (only the last one counted) and was unbound
    # when a page contained no reviews.
    all_updates_ok = True
    while page_index * 100 < total_reviews:
        reviews, page_index, total_reviews, review_success = get_stitcher_reviews(
            stitcher_id, headers, log_file, page_index=page_index)
        if not review_success:
            if reviews == "no_reviews":
                mark_as_stitcher(conn, cursor, podcast_id)
            else:
                stitcher_fail_handler(conn, cursor, stitcher_url,
                                      "parsing reviews", log_file)
            return None
        for review in reviews:
            review_dict = parse_stitcher_review(podcast_id, review)
            if not update_reviews_stitcher(review_dict, conn, cursor):
                all_updates_ok = False
                stitcher_fail_handler(conn, cursor, stitcher_url,
                                      "updating reviews", log_file)
        # NOTE(review): pages are counted with 99 here but the loop condition
        # uses 100 - confirm which page size is correct.
        total_pages = math.ceil(total_reviews / 99)
        print("Success on page {} of {} for {}".format(page_index, total_pages,
                                                       podcast_id))
        pause()

    if all_updates_ok:
        mark_as_stitcher(conn, cursor, podcast_id)
예제 #9
0
def process_podcast(conn, log_file):
    """
    Wrapper function to process a podcast

    Looks the podcast up on Google, parses the result and stores the
    outcome in the stitcher table. Every outcome is printed and written to
    the log file, followed by a random pause. A 503 response from Google
    instead triggers a one hour pause. Requests use the module-level
    `headers`.

    Parameters
    conn: active psycopg2 connection
    log_file (writeable file object): log file to write errors
    Returns
    None
    """
    def pause():
        # Same distribution as the original positional call
        # rvs(2, 45, 1, 1), i.e. K=2, loc=45, scale=1, but drawn as a scalar
        # and converted to float: time.sleep() cannot convert the
        # one-element array that size=1 returns on current Python versions.
        time.sleep(float(exponnorm.rvs(2, loc=45, scale=1)))

    def flag_problem():
        cursor.execute(
            "update stitcher set stitcher_url = 'problem' "
            "where itunes_url = (%s)", [itunes_url])
        conn.commit()

    cursor = conn.cursor()
    # try/finally closes the cursor on every path, including exceptions
    try:
        podcast_name, itunes_url = get_podcast_name(conn)
        google_url = google_url_constructor(podcast_name)
        google_result = google_request(google_url, headers)
        if google_result.status_code == 503:
            print("YOU'VE BEEN DISCOVERED!!!!")
            time.sleep(3600)
            return None
        if google_result.status_code != 200:
            print("failure on {}".format(podcast_name))
            log_file.write("failure on {}\n".format(podcast_name))
            # The original skipped conn.commit() in this branch only
            flag_problem()
            pause()
            return None
        search_url, search_name, parse_success = parse_google_result(
            google_result)
        if not parse_success:
            # parse_google_result reports "no results" by returning True in
            # place of the url; the equality test is kept as in the original.
            if search_url == True:  # noqa: E712
                print("no results for {}".format(podcast_name))
                log_file.write("no results for {}\n".format(podcast_name))
                cursor.execute(
                    "UPDATE stitcher SET search_name = 'no result', "
                    "stitcher_url = 'no result' "
                    "WHERE itunes_url = (%s)", [itunes_url])
                conn.commit()
            else:
                print("failure on {}\n{}".format(podcast_name,
                                                 google_result.text))
                # Trailing newline added so log entries do not run together
                log_file.write("failure on {}\n{}\n".format(
                    podcast_name, google_result.text))
                flag_problem()
            pause()
            return None
        success = update_db(conn, itunes_url, search_url, search_name)
        outcome = "success" if success else "failure"
        print("{} on {}".format(outcome, podcast_name))
        # Trailing newline added so log entries do not run together
        log_file.write("{} on {}\n".format(outcome, podcast_name))
        pause()
    finally:
        cursor.close()
예제 #10
0
# Plot the exponnorm probability density function between its 1st and 99th
# percentiles.
# NOTE(review): `ax`, `K` and `plt` are not defined in this fragment; they
# must be set up before it runs (presumably a Matplotlib axes, the shape
# parameter and matplotlib.pyplot - confirm against the full script).
x = np.linspace(exponnorm.ppf(0.01, K),
                exponnorm.ppf(0.99, K), 100)
ax.plot(x, exponnorm.pdf(x, K),
       'r-', lw=5, alpha=0.6, label='exponnorm pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:

rv = exponnorm(K)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``: the cdf evaluated at the quantiles
# returned by ppf should give back the original probabilities. The result
# of the comparison is not stored.

vals = exponnorm.ppf([0.001, 0.5, 0.999], K)
np.allclose([0.001, 0.5, 0.999], exponnorm.cdf(vals, K))
# True

# Generate random numbers:

r = exponnorm.rvs(K, size=1000)

# And compare the normalized histogram of the samples with the pdf curves:

ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
예제 #11
0
def _h_p_i_rvs(mu, sigma, tau_p, N_i):
    """Draw N_i exponnorm samples for h_p_emg_rvs.

    The samples use location ``mu``, scale ``sigma`` and shape
    ``tau_p / sigma``.
    """
    shape = tau_p / sigma
    return exponnorm.rvs(shape, loc=mu, scale=sigma, size=N_i)
예제 #12
0
def _h_m_i_rvs(mu, sigma, tau_m, N_i):
    """Draw N_i mirrored exponnorm samples for h_m_emg_rvs.

    Zero-located exponnorm draws with scale ``sigma`` and shape
    ``tau_m / sigma`` are subtracted from ``mu``.
    """
    shape = tau_m / sigma
    draws = exponnorm.rvs(shape, loc=0, scale=sigma, size=N_i)
    return mu - draws