Example #1
def RunSync(ts):
    """
        Pick wc-db's table mapped with `ts` and scrapes (useful) "clean" Content & WeightedContent from url.
        * NOTE:
            * If conent is already present in the table, "clean" it too & append the newly scraped content to it.
            * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
        Input: ts (format: 1598692058.887741)
    """
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(SYNC) ................... => FILENAME: {}\n'
        .format(datetime.fromtimestamp(ts),
                'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_sync.csv'))

    csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(
        int(ts)) + '.csv'
    csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(
        int(ts)) + '_wc_sync.csv'
    index = 1
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_dest_file, headers)

    f = csv.writer(open(csv_dest_file, "w"))  # Flush the old file
    f.writerow(headers)  # same header row that creteCsvFile wrote
    with open(csv_src_file, mode='r') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                # DictReader has already consumed the header line; this only logs the column names once
                print(f'Headers are {", ".join(row)}')
                line_count += 1
            #CHECK1(pre scraping): if (content != NULL) => no scraping, just put it in as is
            if (len(row["Content"]) != 0):
                pc.printWarn(
                    "\t <ID = {} > [NO SCRAPING] Content already exists....writing it as is............. NOW: {}"
                    .format(row["ID"],
                            time.strftime("%H:%M:%S", time.localtime())))
                entry = [
                    row["ID"],
                    row["SourceSite"],
                    row["ProcessingDate"],
                    row["ProcessingEpoch"],
                    row["CreationDate"],
                    row["Title"],
                    row["Url"],
                    row["SourceTags"],
                    row["ModelTags"],
                    row["NumUpvotes"],
                    row["NumComments"],
                    row["PopI"],
                    text_actions.clean_text(row["Title"] +
                                            row["WeightedContent"]) +
                    text_actions.getUrlString(
                        row["Content"]),  #add the url-words too
                    text_actions.clean_text(row["Content"]) +
                    text_actions.getUrlString(row["Content"])
                ]
                global WRITTEN_ENTRIES_SYNC
                WRITTEN_ENTRIES_SYNC += 1
                f = csv.writer(open(csv_dest_file, "a"))
                f.writerow(entry)
            #CHECK2(pre scraping): if(url == NULL)=>discard
            #CHECK3(pre scraping): if (row["title"]==NULL)=>discard
            elif ((len(row["Url"]) != 0) and (len(row["Title"]) != 0)):
                pc.printWarn(
                    "\t <ID = {} > [SCRAPING BEGIN] sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}"
                    .format(row["ID"],
                            time.strftime("%H:%M:%S", time.localtime())))
                time.sleep(0.0001)
                try:
                    # response = web_requests.hitGetWithRetry(url,TIMEOUT=10)
                    response = web_requests.hitGetWithRetry(
                        row["Url"], '', False, 2, 0.5, 60)
                    # if response.status_code == 200:
                    if response != -1:
                        # content = text_actions.contentfromhtml(response)  #NOTE: for sync
                        content = text_actions.contentfromhtml(response.text)  #NOTE: for async
                        urlstrings = text_actions.getUrlString(content)
                        content += urlstrings  #add the url-words too
                        # weightedcontent = text_actions.weightedcontentfromhtml(response.text) + row["Title"] + urlstrings  #add the url-words too  #NOTE: for sync
                        weightedcontent = text_actions.weightedcontentfromhtml(response.text) + row["Title"] + urlstrings  #add the url-words too  #NOTE: for async
                        line_count += 1
                        #CHECK1(post scraping): if (content == NULL) and (row["Title"] != NULL) <already checked above> => use the Title as both Content & WeightedContent
                        if (len(content) == 0):
                            content = row["Title"]
                            weightedcontent = row["Title"]
                        entry = [
                            row["ID"], row["SourceSite"],
                            row["ProcessingDate"], row["ProcessingEpoch"],
                            row["CreationDate"], row["Title"], row["Url"],
                            row["SourceTags"], row["ModelTags"],
                            row["NumUpvotes"], row["NumComments"],
                            row["PopI"],
                            text_actions.clean_text(weightedcontent),
                            text_actions.clean_text(content)
                        ]

                        f = csv.writer(open(csv_dest_file, "a"))
                        f.writerow(entry)
                        pc.printMsg(
                            "\t\t <ID = {} > ============== Scraping Done....... \t NOW: {}"
                            .format(
                                row["ID"],
                                time.strftime("%H:%M:%S", time.localtime())))
                    else:
                        global SKIPPED_SYNC
                        SKIPPED_SYNC += 1
                        pc.printErr(
                            "\t\txxxxx SKIPPING... for ID: {} Unable to hit url: {} , "
                            .format(row["ID"], row["Url"]))
                except Exception as e:
                    global FAILED_SYNC
                    FAILED_SYNC += 1
                    pc.printErr(
                        "\t======= XXXXXXXX ERROR XXXXXX ======>> ID= {} NOW = {} Skipping...Failed due to: \n \t\t ERROR {}"
                        .format(row["ID"],
                                time.strftime("%H:%M:%S", time.localtime()),
                                e))
                    pass
    pc.printMsg(
        "\n****************** Content Scraping is Complete, FILENAME: {} ********************\n"
        .format('dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_sync.csv'))
    pc.printMsg(
        "\n----------------------------------------------------------------------------------\n"
    )
    pc.printMsg(
        "|\tWRITTEN_ENTRIES_SYNC \t  | \t {} \t|".format(WRITTEN_ENTRIES_SYNC))
    pc.printMsg("|\tSKIPPED_SYNC          \t | \t {} \t|".format(SKIPPED_SYNC))
    pc.printMsg("|\tFAILED_SYNC           \t | \t {} \t|".format(FAILED_SYNC))
    pc.printMsg(
        "\n----------------------------------------------------------------------------------\n"
    )
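
Both write paths in Example #1 reopen the destination CSV for every row and never close the handles. Below is a minimal sketch of the same read-clean-write loop using context managers; `clean_row` is a hypothetical stand-in for the text_actions based cleaning above, and the helper name is illustrative, not part of the original code.

import csv

def copy_with_cleaning(csv_src_file, csv_dest_file, headers, clean_row):
    # Stream rows from the source table dump, transform each one, and write it out.
    # `clean_row` is a placeholder for the clean_text/getUrlString logic used above.
    with open(csv_src_file, newline='') as src, \
         open(csv_dest_file, 'w', newline='') as dest:
        reader = csv.DictReader(src)
        writer = csv.writer(dest)
        writer.writerow(headers)              # header written exactly once
        for row in reader:
            writer.writerow(clean_row(row))   # one handle stays open for the whole run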
Example #2
def RunSync(ts):
    """
        NOTE: pdf pages taking a lot of time.Is it right to scrape them still?
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Opened > ---------------------------------------------\n"
    )

    blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4']

    q = "select * from " + wc_table + " where length(Content) = 0"
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    pc.printMsg(
        "\n\n \t ******************************* ITEMS FOR SYNC TO SCRAPE = {} ******************************\n\n"
        .format(len(rows)))
    conn.commit()
    for row in rows:
        t1 = time.time()
        if (len(row[13]) == 0):
            try:
                if row[6][-4:] not in blob_pages:
                    response = web_requests.hitGetWithRetry(
                        row[6], '', False, 2, 0.5, 30)
                    if response != -1:
                        gw.CS_SYNC_ITEM_SCRAPED += 1
                        res = response.text
                        row_list = list(row)
                        row_list[13] = res
                        row = tuple(row_list)

                        pc.printWarn(
                            "\t <ID = {}><src= {} > [SYNCED SCRAPED] Done................ \t\t TimeTaken = {} \t NOW: {} "
                            .format(
                                row[0], row[1], round((time.time() - t1), 5),
                                time.strftime("%H:%M:%S", time.localtime())))
                        q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
                        d = (row[13], row[0], row[1])
                        c.execute(q, d)
                        conn.commit()
                        # pc.printSucc(" \t\t ============== <ID= {} ><{}> [SYNCED SCRAPED] INSERTED INTO TABLE =============== ".format(row[0],row[1]))
                    else:
                        gw.CS_SYNC_URL_UNREACHABLE += 1
                        pc.printErr(
                            "\t\tXXXXXXXXX [SYNCED SCRAPED]\t SKIPPING... <ID: {}> Totally unable to hit url even in SYNC: {}  \t\t TimeTaken = {} \t NOW: {} "
                            .format(
                                row[0], row[6], round((time.time() - t1), 5),
                                time.strftime("%H:%M:%S", time.localtime())))
                else:
                    pc.printMsg(
                        "\t\txxxxx [SYNCED SCRAPED]\t SKIPPING... for ID: {} Found BLOB page in SYNC. Will use title. URL: {}  \t\t TimeTaken = {} \t NOW: {} "
                        .format(row[0], row[6], round((time.time() - t1), 5),
                                time.strftime("%H:%M:%S", time.localtime())))
            except Exception as e:
                gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR += 1
                pc.printErr(
                    "\t XXXXXXXXXXXXXX [SYNC SCRAPING] XXXX ==>> <ID = {}><src= {} > NOW = {} , \t\t TimeTaken = {} ....Sync Scraping failed too.Will use Title for content... \n \t\t ERROR=> {}"
                    .format(row[0], row[1],
                            time.strftime("%H:%M:%S", time.localtime()),
                            round((time.time() - t1), 5), e))
                # logging.error(traceback.format_exc())
                pass
    endTime = time.time()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Closed > ---------------------------------------------\n"
    )

    pc.printSucc(
        "\n\n***************************** Sync Content Scraping is Complete. TABLE: {} ******************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(
        ['Success (Post Sync Content Scraping)', 'Notation(if any)', 'Value'])
    table.add_row([
        'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)',
        gw.WC_TOTAL_URL_ENTRIES
    ])
    table.add_row([
        'OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)',
        gw.CS_SYNC_ITEM_SCRAPED
    ])
    table.add_row([
        'TIME TAKEN - SYNC CONTENT SCRAPING (min)', '-',
        round((endTime - startTime) / 60, 5)
    ])
    pc.printSucc(table)

    pc.printErr(
        "------------------------------------------ ERRORS-SYNC (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable(['Failures (Post Sync Content Scraping)', 'Value'])
    table.add_row(
        ['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE])
    table.add_row([
        'COUNT. TRY/CATCHED EXCEP. - SYNC ',
        gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR
    ])
    pc.printErr(table)
    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for Sync Scraping (min) = {} ]\n'
        .format(round((endTime - startTime) / 60, 5)))
    print("\n\n")
Example #3
def run(ts):
    """
        Scrapes PH api for last 7 days & puts data in WP-DB.
            * Api supports daywaise only. So scrape for one day at a time
            * Link to documentation: https://api.producthunt.com/v1/docs/posts/posts_index_request_a_specific_day_with_the_%60day%60_parameter_(tech_category)
        * NOTE:
            * No threshold set on upvotes or comments rn.Maybe later?
            * API-Ratelimit: You can make up to 900 requests every 15 minutes, else gives `status 429` in response.If that happens, wait for 16 mins, then hit again.   
                * Retry 2 times; if failed nonetheless, skip!
            * Content = Tagline
            * URL: is the PH url only. Going to the product page & then finding the actual link is overkill
                * (this could also help later on getting their permission while monetizing)
            * Used self-retry logic. but check this package:: Read about requests.retries here: [doc](https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/#retry-on-failure), [stkofw](https://stackoverflow.com/questions/23267409/how-to-implement-retry-mechanism-into-python-requests-library?rq=1)
        Input: ts (format: 1598692058.887741)

        * ============= row is an array with indices: 
        (ID(0), SourceSite(1), ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5), Url(6),ThumbnailUrl(7),SourceTags(8),NumUpvotes(9),NumComments(10),PopI(11),Content(12))
    """

    wp_db = 'dbs/wp.db'
    wp_table = 'wp_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started PH-scraper ................... => TABLE: {}\n'.
        format(datetime.fromtimestamp(ts), wp_table))
    conn = sqlite3.connect(wp_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Opened > ---------------------------------------------\n"
    )
    startTime = time.time()
    """
        here is how you add day to `ts`:

        from datetime import datetime, timedelta
        newts = datetime.fromtimestamp(ts) + timedelta(days=1) # 2020-08-30 16:02:34.352094
        newts.timestamp() # 1598783633.284871
        datetime.fromtimestamp(ts) #2020-08-29 17:15:32
        # get date from it: 
        datetime.fromtimestamp(ts).date() #2020-08-29
    """
    """ days_arr has last 7 days(including today's) (YYYY-MM-DD)date strings ; just the way PH's API needs
    """
    curr_date = str(int(ts))
    days_arr = [str(datetime.fromtimestamp(int(ts)).date())]  # '2020-08-29'

    for i in range(6):
        new_ts = datetime.fromtimestamp(int(curr_date)) + timedelta(days=-1)
        new_ts = new_ts.timestamp()
        curr_date = new_ts
        days_arr.append(str(datetime.fromtimestamp(int(new_ts)).date()))

    PH_REQ_HEADERS = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": "Bearer " + vault.PH_ACCESS_TOKEN,
        "Host": "api.producthunt.com"
    }

    # csv_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wp-db/wp_table_'+str(int(ts))+'.csv'
    index = gw.WP_TOTAL_ENTRIES_YET + 1

    for date in days_arr:
        pc.printMsg(
            " ................. scraping for date =  {} .................\n".
            format(date))
        url = 'https://api.producthunt.com/v1/posts?day=' + date
        try:
            data = web_requests.hitGetWithRetry(url, PH_REQ_HEADERS, False, 2,
                                                5, 10)
            if (data == -1):
                pc.printErr(
                    "\t\txxxxxx Unable to hit {} after 2 retries. Skipping this date ( {} ) xxxxxx\n"
                    .format(url, date))
            else:
                items_arr = json.loads(data.content)["posts"]
                for item in items_arr:
                    # print(json.dumps(item, indent = 4))
                    """ get all the tags attached along with the item """
                    source_tags = []
                    for tag in item["topics"]:
                        source_tags.append(tag["name"])
                    entry = [
                        index, "PH",
                        datetime.fromtimestamp(ts).date(),
                        int(ts),
                        date_conversion.PHDate(str(item["created_at"])),
                        item["name"], item["discussion_url"],
                        item["thumbnail"]["image_url"],
                        json.dumps(source_tags), item["votes_count"],
                        item["comments_count"], '', item["tagline"]
                    ]
                    # csv_functions.putToCsv(csv_file,entry)
                    c.execute(
                        'INSERT INTO ' + wp_table +
                        ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                    index = index + 1
                    gw.PH_TOTAL_ITEMS_GOT_YET += 1

        except Exception as e:
            pc.printErr(
                " \t xxxxxxxxxxxxx ERROR@PH_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                .format(index, e))
            logging.error(traceback.format_exc())
            pass

        pc.printMsg("\t\t\t ====>> TOTAL_ENTRIES_YET = {}".format(
            gw.PH_TOTAL_ITEMS_GOT_YET))

    gw.WP_TOTAL_ENTRIES_YET += gw.PH_TOTAL_ITEMS_GOT_YET

    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Closed > ---------------------------------------------\n"
    )

    pc.printSucc(
        "\n\n***************************** PH Url Scraping is Complete. TABLE: {} ******************"
        .format(wp_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post PH URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by PH', gw.PH_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WP TABLE YET', gw.WP_TOTAL_ENTRIES_YET])
    table.add_row([
        'TIME TAKEN FOR URL SCRAPING-PH (sec) ',
        round((endTime - startTime), 5)
    ])
    pc.printSucc(table)
    print("\n\n")
Example #4
def run(ts):
    """
        Scrapes Algolia's HN api for last 7 days & puts data in WC-DB.
            * max number of entries in algolia's single api call = 1000. So scrape for one day at a time
            * Link to documentation: https://hn.algolia.com/api
        Note:
            1. For AskHN entries put `` tag & separate threshold
            1. For ShowHN entries put `` tag & separate threshold
            1. For Jobs@HN entries put `` tag => later as these entries dont have upvotes/comments
        Input: ts (format: 1598692058.887741)
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started HN-scraper ................... => TABLE: {}\n'.format(datetime.fromtimestamp(ts),wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n")
    startTime = time.time()

    """
        here is how you add day to `ts`:

        from datetime import datetime, timedelta
        newts = datetime.fromtimestamp(ts) + timedelta(days=1) # 2020-08-30 16:02:34.352094
        newts.timestamp() # 1598783633.284871
        datetime.fromtimestamp(ts) #2020-08-29 17:15:32
    """

    """ ts_arr has last 7 days(including today's) (non-decimal stype)timestamps strings 
        TIP: use `datetime.fromtimestamp(int(t))` to convert to human readable format
    """
    ts_arr = [str(int(ts))]

    for i in range(6):
        new_ts = datetime.fromtimestamp(int(ts_arr[-1])) + timedelta(days=-1)
        new_ts = new_ts.timestamp()
        ts_arr.append(str(int(new_ts)))

    # for t in ts_arr:
    #     print("timestamp: {} \t date: {}".format(t,datetime.fromtimestamp(int(t))))

    index = gw.WC_TOTAL_URL_ENTRIES + 1

    for i in range(len(ts_arr)-1):
        startepoch = ts_arr[i]
        endepoch   = ts_arr[i+1]
        pc.printMsg(" ................. scraping for interval: start= {} -> end = {} .................\n".format(startepoch,endepoch))
        
        """ 
            getting stories(articles) with upvotes_count > upvotes_threshold 
            Also including:
                1. TellHN (<tech_discuss>)
                2. LaunchHN (<startup>)
        """
        pc.printWarn(" \t............. scraping stories .............")
        try:
            url_story = 'http://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_STORY_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_story)
            res_size = json.loads(data.content)["nbHits"]

            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))

            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]

            for item in items_arr:
                url = 'https://news.ycombinator.com/item?id='+str(item["objectID"])
                sourceTag = ''
                content = ''
                sourceSite = 'HN'
                if(item["url"] is None): #as all ShowHNs may not have an url ...hihi...
                    # print( '------------------------- found null urled value ---------------------\n-----[STORY]url: {}'.format(url))
                    # print(json.dumps(item, indent = 4))
                    if(item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                    if("Launch HN:" in item["title"]):                                    # 1. LaunchHN
                        sourceTag = 'startup'
                        sourceSite += '/launch'
                    if("Tell HN:" in item["title"]):                                      # 2. TellHN
                        sourceTag = 'tech_discuss'
                        sourceSite += '/tell'
                else:
                    url = item["url"] 
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],              
                    url,
                    sourceTag,
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                    ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index=index+1

            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@HN_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass

        """ getting ShowHNs """
        pc.printWarn("\t............. scraping showHNs .............")
        try:
            url_show = 'http://hn.algolia.com/api/v1/search_by_date?tags=show_hn&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_SHOWHN_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_show)
            res_size = json.loads(data.content)["nbHits"]

            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))
            
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]

            for item in items_arr:
                content = ''
                sourceSite = 'HN/show'
                if(item["url"] is None): #as all ShowHNs may not have an url ...hihi...
                    url = 'https://news.ycombinator.com/item?id='+str(item["objectID"])
                    # print( '-------------------------- found null urled value ---------------------\n-----[SHOW]url: {}'.format(url))
                    # print(json.dumps(item, indent = 4))
                    if(item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"] 
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],              
                    url,
                    'sideproj',
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                    ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index=index+1

            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@HN_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass


        """ getting AskHNs """

        pc.printWarn("\t............. scraping askHNs .............")
        try:
            url_ask = 'http://hn.algolia.com/api/v1/search_by_date?tags=ask_hn&hitsPerPage=9999&numericFilters=created_at_i>'+str(endepoch)+',created_at_i<'+ str(startepoch) + ',points>' + str(gw.HN_ASKHN_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_ask)
            res_size = json.loads(data.content)["nbHits"]

            pc.printWarn("\t\t\t\t====> Item count: {}".format(res_size))

            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]
            

            for item in items_arr:
                content = ''
                sourceSite = 'HN/ask'
                if(item["url"] is None): #as AskHNs dont have any url ...hihi...
                    url = 'https://news.ycombinator.com/item?id='+str(item["objectID"])
                    # print( '-------------------------- found null urled value ---------------------\n-----[ASK]url: {}'.format(url))
                    # print(json.dumps(item, indent = 4))
                    if(item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"] 
                entry = [
                    index,
                    sourceSite,
                    datetime.fromtimestamp(ts).date(),
                    int(ts),
                    date_conversion.HNDate(str(item["created_at"])),
                    item["title"],              
                    url,
                    'prog_query',
                    '',
                    item["points"],
                    item["num_comments"],
                    '',
                    '',
                    text_actions.clean_text(content)
                    ]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index=index+1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@HN_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass

    endTime = time.time()
    conn.commit()
    conn.close()
    gw.WC_TOTAL_URL_ENTRIES += gw.HN_TOTAL_ITEMS_GOT_YET
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n")

    pc.printSucc("\n\n***************************** HN Url Scraping is Complete. TABLE: {} ******************".format(wc_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post HN URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by HN', gw.HN_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-HN (sec) ', round((endTime - startTime),5)])
    pc.printSucc(table)
    print("\n\n")