def run(ts):
    """Germinate the tag tree, fill node stats, and build the th_<ts> table.

    Steps (all keyed on the epoch timestamp `ts`):
      1. Create the tree schema (TreeGermination).
      2. Update leaf (tag) nodes with item_count & avg_popi from the wc table.
      3. Roll leaf values up into the parent nodes.
      4. Create th_<ts> and populate it via an MPTT walk of the tree.
      5. Attach ItemIDs (HN_IDs, R_IDs) from the wc table to th_<ts>.

    :param ts: epoch timestamp (e.g. 1598692058.887741) naming the wc_/th_ tables.
    """
    """ create the tree """
    startTime = time.time()
    pc.printWarn(
        "\t\t . . . . . . . . . ....... Tree Germination in progress ....... . . . . . . . . .\n"
    )
    root = TreeGermination()  # root node of the freshly built tag tree
    pc.printSucc(
        "\t\t <----------------------------------------------- Tree is Germinated ------------------------------------------------>\n"
    )
    """ update leafnodes """
    pc.printWarn(
        "\t\t . . . . . . . . . ....... Updating Leaf(tag) Nodes....... . . . . . . . . .\n"
    )
    updateLeafNodes(ts)
    pc.printSucc(
        "\t\t <--------------------------------------------- Leaf Nodes updated ------------------------------------------------>\n"
    )
    """ update parents """
    pc.printWarn(
        "\t\t . . . . . . . . . ....... Updating Parent Nodes....... . . . . . . . . .\n"
    )
    updateParentNodes(root)
    pc.printSucc(
        "\t\t <--------------------------------------------- Parent Nodes updated ------------------------------------------------>\n"
    )
    """ NOTE: Print the Tree if you want """
    tree_printer_pretty.print_tree(root)
    """ Create & Populate Tag Hotness(TH) Table"""
    pc.printWarn(
        "\t\t . . . . . . . . . ....... Creating & Populating TH Table ....... . . . . . . . . .\n"
    )
    create_th(ts)
    update_th_mptt(root, 1, 1, ts)  # update_th_mptt(root,left,level,ts)
    pc.printSucc(
        "\t\t <--------------------------------------------- TH Table Created & Populated ------------------------------------------------>\n"
    )
    """ Update th_table for ItemIDs of wc_table """
    pc.printWarn(
        "\t\t . . . . . . . . . ....... Updating th_table for ItemIDs from wc_table....... . . . . . . . . .\n"
    )
    update_th_table_for_itemIDs(root, ts)
    pc.printSucc(
        "\t\t <--------------------------------------------- th_table now has ItemIDs(HN_IDs,R_IDs) from wc_table ------------------------------------------------>\n"
    )
    endTime = time.time()
    th_table = 'th_' + str(int(ts))  # name of the table this run produced
    # NOTE(review): the reported time also includes tree germination & node
    # updates, not just th-table creation/updating.
    pc.printWarn(
        "\t\t ---------------> TIME TAKEN FOR th_creating & th_updating@th (sec) => {} => TABLE: {}\n"
        .format(round((endTime - startTime), 5), th_table))
def update_modelTags(ts):
    """Fill the ModelTags column of wc_<ts> in dbs/wc.db via the tagger simulator.

    For every row, SimulatorApi(Content, WeightedContent) yields (tag, confidence)
    pairs; tags whose confidence clears tags_threshold[tag] are collected and
    JSON-encoded into the row's ModelTags column.

    :param ts: epoch timestamp (float/int) that names the wc_<ts> table.
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started TaggerSimulator@wc ................... => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < TaggerSimulator@wc : DB Connection Opened > ---------------------------------------------\n"
    )
    pc.printWarn("\tRunning PopiCalculator for wc ....... \t NOW: {}".format(
        time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn(
        "\t\t. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ."
    )
    startTime = time.time()
    q = "select * from " + wc_table
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    for row in rows:
        """ ============= row is an array with indices:
        ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
        SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
        """
        modelTags = []
        #TODO: call actual Api here, when model is ready
        # pc.printMsg("\t <ID = {}><src= {} > [Tagger] Start................ ".format(row[0],row[1]))
        # conf_arr: list of (tag, confidence) pairs from the stand-in model.
        conf_arr = SimulatorApi(row[13], row[12])
        for item in conf_arr:
            tag = item[0]
            conf = item[1]
            # NOTE(review): raises KeyError if the simulator emits a tag that is
            # missing from tags_threshold — confirm the two tag sets always match.
            if (conf >= tags_threshold[tag]):
                modelTags.append(tag)
                # pc.printWarn(" \t\t\t\t => Added \t {} \t conf = {}".format(tag,conf))
        modelTags = json.dumps(modelTags)  # stored as a JSON array string
        query = 'update ' + wc_table + ' set ModelTags = ? where ID = ? and SourceSite = ?'
        data = (modelTags, row[0], row[1])
        c.execute(query, data)
    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < TaggerSimulator@wc: DB Connection Closed > ---------------------------------------------\n"
    )
    pc.printWarn(
        "\t\t ---------------> TIME TAKEN FOR TaggerSimulator@wc(sec) => {} => TABLE: {}\n"
        .format(round((endTime - startTime), 5), wc_table))
def updateLeafNodes(ts):
    """Fill each leaf (tag) node of the tree with stats from the wc_<ts> table.

    For every tag in tags_names runs the equivalent of:
        select count(ID) from wc_<ts>
        where ModelTags like '%"<tag>"%' or SourceTags like '%"<tag>"%'
    plus the matching avg(PopI), and stores the results on node_dict[tag] as
    .count and .popi — leaf nodes only (.isTag).

    :param ts: epoch timestamp naming the wc_<ts> table in dbs/wc.db.
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started UpdateLeafNodes@wc ................... => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < UpdateLeafNodes@wc : DB Connection Opened > ---------------------------------------------\n"
    )
    pc.printWarn("\tRunning UpdateLeafNodes for wc ....... \t NOW: {}".format(
        time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn(
        "\t\t. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ."
    )
    startTime = time.time()
    for tag in tags_names:
        # Tags are stored JSON-encoded, hence the '%"tag"%' LIKE pattern.
        d = ('%"{}"%'.format(tag), '%"{}"%'.format(tag))
        q = 'select count(ID) from ' + wc_table + ' where ModelTags like ? or SourceTags like ?'
        # fix: execute() returns the cursor, not the value — no point binding it.
        c.execute(q, d)
        item_count = c.fetchone()[0]
        q = 'select avg(PopI) from ' + wc_table + ' where ModelTags like ? or SourceTags like ?'
        c.execute(q, d)
        avg_popi = c.fetchone()[0]
        # avg() yields NULL (None) when no rows match — treat as 0.
        if avg_popi is None:  # fix: identity comparison (was `== None`)
            avg_popi = 0
        else:
            avg_popi = round(avg_popi, 10)
        curr_node = node_dict[tag]
        if curr_node.isTag:  # update only if its a leaf
            curr_node.count = item_count
            curr_node.popi = avg_popi
            pc.printSucc(
                " \t\t\t..... Updated node: {} \t => c = {} , p = {}".format(
                    curr_node.name, item_count, avg_popi))
    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < UpdateLeafNodes@wc: DB Connection Closed > ---------------------------------------------\n"
    )
    pc.printWarn(
        "\t\t ---------------> TIME TAKEN FOR UpdateLeafNodes In Tree (sec) => {} \n"
        .format(round((endTime - startTime), 5)))
def run(ts):
    """Run the PopI calculators for both the wc and wp tables for timestamp `ts`.

    Each calculator runs in its own try/except so a failure in one does not stop
    the other; errors are printed and logged with a full traceback.

    :param ts: epoch timestamp identifying the wc_<ts>/wp_<ts> tables.
    """
    startTime = time.time()
    try:
        run_wc(ts)
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running PopICalculator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}".format(str(e)))
        logging.error(traceback.format_exc())
    try:
        run_wp(ts)
    except Exception as e:
        # BUGFIX: this message previously said "wc table" — it reports the wp run.
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running PopICalculator for wp table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}".format(str(e)))
        logging.error(traceback.format_exc())
    endTime = time.time()
    pc.printSucc("**************************** PopI Calculation is Done for wc & wp ********************************\n\n")
    pc.printWarn("| \t\t TIME TAKEN FOR PopICalculators-both \t\t | \t\t {} \t\t |".format(round((endTime - startTime),5)))
    pc.printSucc("*************************************************************************************************\n\n")
    pc.printSucc("\n\n***************************** PopI Calculation is Complete.************************")
    print("\n\n")
    # Summary table of the whole run.
    table = PrettyTable(['Entity (Post PopI Calculation)', 'Value'])
    table.add_row(['TIME TAKEN FOR PopICalculators(wc & wp) (min)', round((endTime - startTime)/60,2)])
    pc.printSucc(table)
    print("\n\n")
def RunAsync(ts): """ Pick wc-db's table mapped with `ts` and scrapes (useful) "clean" Content & WeightedContent from url- ASYNCLY * NOTE: * If conent is already present in the table, "clean" it too & append the newly scraped content to it. * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k Input: ts (format: 1598692058.887741) """ # pc.printMsg('@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = 10, conn_lim =10]............ => FILENAME: {}\n'.format(datetime.fromtimestamp(ts),'dbs/wc-db/wc_table_'+str(int(ts))+'_wc.csv')) pc.printMsg( '@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = 50, conn_lim =50]............ => FILENAME: {}\n' .format(datetime.fromtimestamp(ts), 'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv')) stratTime = time.time() csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str( int(ts)) + '.csv' csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str( int(ts)) + '_wwccc100-8.csv' # Run the async job asyncio.get_event_loop().run_until_complete( asyncio.ensure_future(asyncFetchAll(csv_src_file, csv_dest_file))) endTime = time.time() pc.printSucc( "\n****************** Content Scraping is Complete , FILENAME: {} ********************\n \t\t ===========> TIME TAKEN = {}" .format('dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv', (endTime - stratTime))) pc.printMsg( "\n------------------------------------------------------------------------" ) pc.printMsg("|\tENTRIES_TO_BE_WRITTEN \t | \t {} \t|".format( ENTRIES_TO_BE_WRITTEN)) pc.printMsg("|\tWRITTEN_ENTRIES_ASYNC_DIRECT \t | \t {} \t|".format( WRITTEN_ENTRIES_ASYNC_DIRECT)) pc.printMsg("|\tWRITTEN_ENTRIES_ASYNC_SCRAPED\t | \t {} \t|".format( WRITTEN_ENTRIES_ASYNC_SCRAPED)) pc.printMsg("|\tSKIPPED_ASYNC \t | \t {} \t|".format( SKIPPED_ASYNC)) pc.printMsg( "|\tFAILED_ASYNC \t | \t {} \t|".format(FAILED_ASYNC)) pc.printMsg( "--------------------------------------------------------------------------\n" )
async def fetchWithRetry(row, session):
    """Fetch row["Url"] with up to 3 retries; on success, scrape content into `row`.

    On HTTP 200 with a non-empty body the row's WeightedContent/Content fields
    are rebuilt from the scraped HTML (falling back to Title when the scraped
    content is empty) and the row is returned. After all retries fail, the row
    is returned unchanged and SKIPPED_ASYNC is incremented.

    :param row: dict with at least ID, SourceSite, Url, Title, Content keys.
    :param session: aiohttp ClientSession used for the GET.
    :return: the (possibly updated) row dict.
    """
    status = 400
    retry_cnt = 3
    sleep_time = 10   # seconds between retries
    TIMEOUT = 60      # per-request timeout, seconds
    while retry_cnt > 0 and status != 200:
        # NOTE(review): Purpose.CLIENT_AUTH builds a *server-side* context; for
        # an outbound client request SERVER_AUTH (the default) is the expected
        # purpose — confirm this was intentional (it loosens certificate checks).
        # Exceptions from session.get (timeouts, connection errors) propagate to
        # the caller — there is no try/except here.
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                    .format(row["ID"], row["SourceSite"],
                            time.strftime("%H:%M:%S", time.localtime())))
                # Keep any URL strings already embedded in the existing Content.
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.clean_text(
                    text_actions.weightedcontentfromhtml(res) + row["Title"] +
                    urlstrings)
                row["Content"] = text_actions.clean_text(
                    text_actions.contentfromhtml(res) + urlstrings)
                # Fallback: no scrapeable content -> use the Title for both.
                if (len(row["Content"]) == 0):
                    row["WeightedContent"] = text_actions.clean_text(
                        row["Title"])
                    row["Content"] = text_actions.clean_text(row["Title"])
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- Unable to hit URL(ERR_CODE={}): {} Sleeping for {} Retries remaining = {} -------------x"
                    .format(status, row["Url"], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    # All retries exhausted: count the skip and hand the row back untouched.
    pc.printErr(
        "\t\txxxxx SKIPPING... for <ID = {}><src= {} > Unable to hit url: {} , "
        .format(row["ID"], row["SourceSite"], row["Url"]))
    global SKIPPED_ASYNC
    SKIPPED_ASYNC += 1
    return row
def create_th(ts):
    """Create (or flush) the Topic-Hotness table th_<ts> and seed one row per tree node.

    If th_<ts> already exists its rows are deleted; otherwise the table is
    created. Every node in node_dict then gets one row with its count/avg-PopI
    from the tree and placeholder MPTT columns (filled later by the MPTT walk).

    :param ts: epoch timestamp naming the th_<ts> table in dbs/th.db.
    """
    th_db = 'dbs/th.db'
    th_table = 'th_' + str(int(ts))
    conn = sqlite3.connect(th_db, timeout=10)
    cur = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < Create_th: DB Connection Opened > ---------------------------------------------\n"
    )
    # Flush the table if it is already there, otherwise create it fresh.
    cur.execute(
        "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'"
        .format(th_table))
    already_there = cur.fetchone()[0] == 1
    if already_there:
        cur.execute("delete from {}".format(th_table))
    else:
        cur.execute(
            "CREATE TABLE {} (ID, NodeName, LeftMptt, RightMptt, DepthLevel, ItemCount, AvgPopI, HN_IDs,R_IDs)"
            .format(th_table))
    insert_sql = 'INSERT INTO ' + th_table + ' VALUES (?,?,?,?,?,?,?,?,?)'
    # LeftMptt/RightMptt/DepthLevel start as -1/-1/0 placeholders.
    for row_id, node_name in enumerate(node_dict, start=1):
        count_n_popi = queryTreeNodeForCountNPopi(node_name)
        cur.execute(insert_sql,
                    (row_id, node_name, -1, -1, 0, count_n_popi[0],
                     count_n_popi[1], '', ''))
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < Create_th: DB Connection Closed > ---------------------------------------------\n"
    )
    pc.printSucc(
        "\t **************************************** TH Table Created: {} ******************************************************\n"
        .format(th_table))
def run(ts):
    """Run the tagger simulator over the wc_<ts> table and report elapsed time.

    Any exception from update_modelTags is caught, printed, and logged with a
    traceback; the summary banner is printed regardless.

    :param ts: epoch timestamp identifying the wc_<ts> table.
    """
    t0 = time.time()
    try:
        update_modelTags(ts)
    except Exception as err:
        pc.printErr(
            " xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Tagger Simulator for wc table xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}"
            .format(str(err)))
        logging.error(traceback.format_exc())
    elapsed = round((time.time() - t0), 5)
    pc.printSucc(
        "**************************** Tagger(Simulator) Run is Complete for wc **********************************************"
    )
    pc.printWarn(
        "| \t\t TIME TAKEN FOR Tagger(Simulator) Run(sec) \t\t | \t\t {} \t\t |"
        .format(elapsed))
    pc.printSucc(
        "***********************************************************************************************************************\n\n"
    )
def run(ts):
    """
    Scrapes PH api for last 7 days & puts data in WP-DB.
        * Api supports daywise only. So scrape for one day at a time
        * Link to documentation: https://api.producthunt.com/v1/docs/posts/posts_index_request_a_specific_day_with_the_%60day%60_parameter_(tech_category)
        * NOTE:
            * No threshold set on upvotes or comments rn. Maybe later?
            * API-Ratelimit: You can make up to 900 requests every 15 minutes, else gives `status 429` in response. If that happens, wait for 16 mins, then hit again.
            * Retry 2 times; if failed nonetheless, skip!
            * Content = Tagline
            * URL: is the PH url only. Going to the product page & then finding the actual link is overkill
                * (this could also help later on getting their permission while monetizing)
        * Used self-retry logic. but check this package:: Read about requests.retries here: [doc](https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/#retry-on-failure), [stkofw](https://stackoverflow.com/questions/23267409/how-to-implement-retry-mechanism-into-python-requests-library?rq=1)
    Input: ts (format: 1598692058.887741)
        * ============= row is an array with indices: (ID(0), SourceSite(1),
          ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),
          Url(6),ThumbnailUrl(7),SourceTags(8),NumUpvotes(9),NumComments(10),PopI(11),Content(12))
    """
    wp_db = 'dbs/wp.db'
    wp_table = 'wp_' + str(int(ts))
    pc.printSucc(
        '@[{}] >>>>>> Started PH-scraper ................... => TABLE: {}\n'.
        format(datetime.fromtimestamp(ts), wp_table))
    conn = sqlite3.connect(wp_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Opened > ---------------------------------------------\n"
    )
    startTime = time.time()
    """ here is how you add day to `ts`:
        from datetime import datetime, timedelta
        newts = datetime.fromtimestamp(ts) + timedelta(days=1)  # 2020-08-30 16:02:34.352094
        newts.timestamp()                                       # 1598783633.284871
        datetime.fromtimestamp(ts)                              # 2020-08-29 17:15:32
        # get date from it: datetime.fromtimestamp(ts).date()   # 2020-08-29
    """
    """ days_arr has last 7 days(including today's) (YYYY-MM-DD)date strings ; just the way PH's API needs """
    curr_date = str(int(ts))
    days_arr = [str(datetime.fromtimestamp(int(ts)).date())]  # '2020-08-29'
    # Walk backwards one day at a time to collect the previous six dates.
    for i in range(6):
        new_ts = datetime.fromtimestamp(int(curr_date)) + timedelta(days=-1)
        new_ts = new_ts.timestamp()
        curr_date = new_ts
        days_arr.append(str(datetime.fromtimestamp(int(new_ts)).date()))
    PH_REQ_HEADERS = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": "Bearer " + vault.PH_ACCESS_TOKEN,
        "Host": "api.producthunt.com"
    }
    # csv_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wp-db/wp_table_'+str(int(ts))+'.csv'
    # Row IDs continue from whatever is already counted in the wp table.
    index = gw.WP_TOTAL_ENTRIES_YET + 1
    for date in days_arr:
        pc.printMsg(
            " ................. scraping for date = {} .................\n".
            format(date))
        url = 'https://api.producthunt.com/v1/posts?day=' + date
        try:
            # hitGetWithRetry(url, headers, <flag>, retries, sleep, timeout);
            # returns -1 when all retries fail.
            data = web_requests.hitGetWithRetry(url, PH_REQ_HEADERS, False, 2,
                                                5, 10)
            if (data == -1):
                pc.printErr(
                    "\t\txxxxxx Unable to hit {} after 2 retries.Skipping this date( {} ) xxxxxx\n"
                    .format(url, date))
            else:
                items_arr = json.loads(data.content)["posts"]
                for item in items_arr:
                    # print(json.dumps(item, indent = 4))
                    """ get all the tags attached along with the item """
                    source_tags = []
                    for tag in item["topics"]:
                        source_tags.append(tag["name"])
                    # One wp-table row; PopI (index 11) is left empty for the
                    # PopI calculator to fill later.
                    entry = [
                        index, "PH",
                        datetime.fromtimestamp(ts).date(),
                        int(ts),
                        date_conversion.PHDate(str(item["created_at"])),
                        item["name"], item["discussion_url"],
                        item["thumbnail"]["image_url"],
                        json.dumps(source_tags), item["votes_count"],
                        item["comments_count"], '', item["tagline"]
                    ]
                    # csv_functions.putToCsv(csv_file,entry)
                    c.execute(
                        'INSERT INTO ' + wp_table +
                        ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                    index = index + 1
                    gw.PH_TOTAL_ITEMS_GOT_YET += 1
        except Exception as e:
            pc.printErr(
                " \t xxxxxxxxxxxxx ERROR@PH_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                .format(index, e))
            logging.error(traceback.format_exc())
            pass
    pc.printMsg("\t\t\t ====>> TOTAL_ENTRIES_YET = {}".format(
        gw.PH_TOTAL_ITEMS_GOT_YET))
    # NOTE(review): PH_TOTAL_ITEMS_GOT_YET is a running global — confirm it is
    # reset before this run, otherwise this += double-counts earlier scrapes.
    gw.WP_TOTAL_ENTRIES_YET += gw.PH_TOTAL_ITEMS_GOT_YET
    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < PH_SCRAPER: DB/wp Connection Closed > ---------------------------------------------\n"
    )
    pc.printSucc(
        "\n\n***************************** PH Url Scraping is Complete. TABLE: {} ******************"
        .format(wp_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post PH URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by PH', gw.PH_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WP TABLE YET', gw.WP_TOTAL_ENTRIES_YET])
    table.add_row([
        'TIME TAKEN FOR URL SCRAPING-PH (sec) ',
        round((endTime - startTime), 5)
    ])
    pc.printSucc(table)
    print("\n\n")
async def fetchWithRetry(row, session):
    """
    Hits url (with retries):
        * if status == 200: return response ((raw)Content & (raw)WeightedContent in row)
        * if still unable to hit after retries: row is returned unchanged and
          ERR_ASYNC_ON_URL_ERROR is incremented (caller writes it anyways).
    INPUT: `row` is a tuple with indices:
        ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
        SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
    """
    status = 400
    retry_cnt = 2
    sleep_time = 5   # seconds between retries
    # TIMEOUT = ClientTimeout(total=20)
    TIMEOUT = 20     # per-request timeout, seconds
    while retry_cnt > 0 and status != 200:
        # NOTE(review): Purpose.CLIENT_AUTH builds a *server-side* context; for
        # an outbound client request SERVER_AUTH (the default) is the expected
        # purpose — confirm this was intentional (it loosens certificate checks).
        async with session.get(row[6],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            # res = await response.content.read()
            # res = await text_actions.clean_text(str(response.content.read()))
            res = text_actions.clean_text(str(res))
            # res = res.encode('utf8', 'ignore').decode('utf8', 'ignore') #FIXME: not working
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== #Scraped ....... \t NOW: {}"
                    .format(row[0], row[1],
                            time.strftime("%H:%M:%S", time.localtime())))
                # Tuples are immutable — rebuild via a list to set indices 12/13.
                row_list = list(row)
                row_list[12] = text_actions.weightedcontentfromhtml(res)
                row_list[13] = text_actions.contentfromhtml(res)
                # for i in range(len(row_list)):
                #     row_list[i] = row_list[i].decode("utf-8", "ignore")
                row = tuple(row_list)
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row[0],row[1],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                # Scrape yielded no Content: counted, but the row is still returned.
                if (len(row[13]) == 0):
                    global ERR_ASYNC_NO_CONTENT_IN_SCRAPING
                    ERR_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr(
                        "\t\t xxxxxxxxxxxxxxxxxxx SKIPPING for <ID = {}><src= {} > As No Content even after scraping xxxxxxxxxxxxxxxxxxxxxxxx\n"
                        .format(row[0], row[1]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x"
                    .format(row[0], row[1], status, row[6][:25], sleep_time,
                            retry_cnt))
                await asyncio.sleep(sleep_time)
    # All retries exhausted: count the failure and return the row untouched.
    pc.printErr(
        "\t\txxxxx For <ID = {}><src= {} >Totally unable to hit url.... using Title for Content & WeightedContent : {} "
        .format(row[0], row[1], row[6]))
    global ERR_ASYNC_ON_URL_ERROR
    ERR_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(
        " \t\t\t ============== [Unreachable URL] Will write anyways. <ID = {}><src= {} > =============== "
        .format(row[0], row[1]))
    return row
async def fetchWithRetry(row, session, csv_out):
    """
    Hits url (with retries):
        * if status == 200: put content into csv
        * if still unable to hit after retries: Content = Title , WeightedContent = Title
          (the row is written to csv either way, unless Title is empty)

    :param row: dict with ID, SourceSite, Url, Title, Content, WeightedContent keys.
    :param session: aiohttp ClientSession used for the GET.
    :param csv_out: open csv writer target passed through to write_result.
    :return: the (possibly updated) row dict.
    """
    status = 400
    retry_cnt = 2
    sleep_time = 10  # seconds between retries
    TIMEOUT = 10     # per-request timeout, seconds
    while retry_cnt > 0 and status != 200:
        # NOTE(review): Purpose.CLIENT_AUTH builds a *server-side* context; for
        # an outbound client request SERVER_AUTH (the default) is the expected
        # purpose — confirm this was intentional.
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                    .format(row["ID"], row["SourceSite"],
                            time.strftime("%H:%M:%S", time.localtime())))
                # Keep URL strings already embedded in the existing Content.
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.weightedcontentfromhtml(
                    res) + row["Title"] + urlstrings
                row["Content"] = text_actions.contentfromhtml(res) + urlstrings
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                # Only rows with a Title get written; empty-Content rows fall
                # back to Title for both text fields.
                if (len(row["Title"]) != 0):
                    if len(row["Content"]) == 0:
                        row["WeightedContent"] = row["Title"]
                        row["Content"] = row["Title"]
                    await write_result(csv_out, row)
                    global WRITTEN_ENTRIES_ASYNC_SCRAPED
                    WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                    pc.printMsg(
                        " \t\t ============== [Scraped] Done Writing into csv for <ID = {}><src= {} > =============== "
                        .format(row["ID"], row["SourceSite"]))
                else:
                    global WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING
                    WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr(
                        "\t\t xxxxxxxxxxxxxxxxxxx SKIPPING for <ID = {}><src= {} > As No Title xxxxxxxxxxxxxxxxxxxxxxxx\n"
                        .format(row["ID"], row["SourceSite"]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x"
                    .format(row["ID"], row["SourceSite"], status,
                            row["Url"][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    # All retries exhausted: write the row anyway, with Title as fallback text.
    pc.printErr(
        "\t\txxxxx For <ID = {}><src= {} >Totally unable to hit url.... using Title for Content & WeightedContent : {} "
        .format(row["ID"], row["SourceSite"], row["Url"]))
    if len(row["Content"]) == 0:
        row["WeightedContent"] = row["Title"]
        row["Content"] = row["Title"]
    await write_result(csv_out, row)
    global WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR
    WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(
        " \t\t\t ============== [Unreachable URL] Done Writing into csv for <ID = {}><src= {} > =============== "
        .format(row["ID"], row["SourceSite"]))
    return row
def RunAsync(ts):
    """
    Pick wc-db's table mapped with `ts` and scrapes (useful) "clean" Content &
    WeightedContent from url - ASYNCLY (DB-table variant).
        * NOTE:
            * If content is already present in the table, "clean" it too & append
              the newly scraped content to it.
            * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
    Input: ts (format: 1598692058.887741)
    """
    # NOTE(review): CONNTECTION_COUNT is a (module-level) misspelling of
    # "CONNECTION_COUNT" — kept, since renaming would break other references.
    global CONNTECTION_COUNT, SEMAPHORE_COUNT
    wc_table = 'wc_' + str(int(ts))
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = {}, conn_lim ={}]............ => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), SEMAPHORE_COUNT, CONNTECTION_COUNT,
                wc_table))
    stratTime = time.time()  # NOTE(review): typo for "startTime" — kept as-is
    # csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_'+str(int(ts))+'.csv'
    # csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_'+str(int(ts))+'_wc.csv'
    # Run the async job (blocks until asyncFetchAll finishes).
    asyncio.get_event_loop().run_until_complete(
        asyncio.ensure_future(asyncFetchAll(ts)))
    endTime = time.time()
    pc.printSucc(
        "\n****************** (Async)Content Scraping is Complete , TABLE: {} ********************"
        .format(wc_table))
    # Run counters (module-level globals updated by the fetch coroutines).
    pc.printMsg(
        "\n--------------------------------------------------------------------------------------------------------------------------------"
    )
    pc.printMsg(
        "|\t\t IN : Total Entries in Url-Scraped Output Table \t\t | \t\t {} \t\t|"
        .format(ENTRIES_TO_BE_WRITTEN))
    pc.printMsg(
        "|\t\t OUT: WRITTEN_ENTRIES_ASYNC_DIRECT(content exists) \t\t | \t\t {} \t\t|"
        .format(WRITTEN_ENTRIES_ASYNC_DIRECT))
    pc.printMsg(
        "|\t\t OUT: WRITTEN_ENTRIES_ASYNC_SCRAPED(scraped entries) \t\t | \t\t {} \t\t|"
        .format(WRITTEN_ENTRIES_ASYNC_SCRAPED))
    pc.printErr(
        "\n\n------------------ ERRORS In Scraping (Written nonetheless; counted in WRITTEN_ENTRIES_ASYNC_SCRAPED) --------------------------\n"
    )
    pc.printMsg(
        "================================================================================================================================="
    )
    pc.printErr(
        "|\t\t ERR_ASYNC_NO_CONTENT_IN_SCRAPING(url hit;not content-written ) \t\t | \t\t {} \t\t|"
        .format(ERR_ASYNC_NO_CONTENT_IN_SCRAPING))
    pc.printErr(
        "|\t\t ERR_ASYNC_ON_URL_ERROR(url not hit) \t\t | \t\t {} \t\t|"
        .format(ERR_ASYNC_ON_URL_ERROR))
    pc.printErr(
        "|\t\t ERR_ASYNC_TRIED_ERR(other try/catch errs) \t\t | \t\t {} \t\t|"
        .format(ERR_ASYNC_TRIED_ERR))
    pc.printMsg(
        "---------------------------------------------------------------------------------------------------------------------------------\n"
    )
    pc.printWarn(
        '\t\t\t\t------------------------->>>>>> [ Semaphore Count = {}, Tcp connector limit ={} ]\n'
        .format(SEMAPHORE_COUNT, CONNTECTION_COUNT))
    pc.printWarn(
        '\t\t\t\t------------------------->>>>>> [ Time Taken(sec) = {} ]\n'.
        format(int(endTime - stratTime)))
def run_wp(ts):
    """Compute & store the PopI column for every row of wp_<ts> in dbs/wp.db.

    For each row it looks up (and memoizes) per-source maxima of upvotes and
    comments — daily (keyed on SourceSite+CreationDate) and weekly (keyed on
    SourceSite+ProcessingDate) — then feeds them to CalculatePopi and writes
    the result back into the row's PopI column.

    BUGFIX: the weekly cache-hit branch previously indexed WeeklyMaxMap with
    the *daily* key (popi_item_daily) after checking membership with the weekly
    key; it now consistently uses popi_item_weekly.

    :param ts: epoch timestamp naming the wp_<ts> table.
    """
    wp_db = 'dbs/wp.db'
    wp_table = 'wp_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started PopICalculator@wp ................... => TABLE: {}\n'.format(datetime.datetime.fromtimestamp(ts),wp_table))
    conn = sqlite3.connect(wp_db, timeout=10)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < PopICalculator@wp : DB/wp Connection Opened > ---------------------------------------------\n")
    startTime = time.time()
    pc.printWarn("\tRunning PopiCalculator for wp ....... \t NOW: {}".format(time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn("\t\t. . . . . . . . . . . . . ....... PopI Calculation for wp table Started ....... . . . . . . . . . . . . . . . . . . .")
    days = GetLastSevenDays(ts)
    """ Initialize both maps(weekly & daily): key = PopiItem, Value = (max_upvotes, max_comments) """
    DailyMaxMap = collections.defaultdict(list)
    WeeklyMaxMap = collections.defaultdict(list)
    q = "select * from " + wp_table
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    for row in rows:
        """
        * ============= row is an array with indices: (ID(0), SourceSite(1),
          ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5), Url(6),
          ThumbnailUrl(7),SourceTags(8),NumUpvotes(9),NumComments(10),PopI(11),Content(12))
        """
        popi_item_daily = PopiItem(row[1], row[4])   # (SourceSite, CreationDate)
        popi_item_weekly = PopiItem(row[1], row[2])  # (SourceSite, ProcessingDate)
        # Daily maxima, memoized per (SourceSite, CreationDate).
        if popi_item_daily in DailyMaxMap:
            max_upvotes_day = DailyMaxMap[popi_item_daily][0]
            max_comments_day = DailyMaxMap[popi_item_daily][1]
        else:
            q = "select max(NumUpvotes) from " + wp_table + " where SourceSite = ? and CreationDate = ?"
            d = (row[1], row[4])
            c.execute(q, d)
            max_upvotes_day = c.fetchone()[0]
            q = "select max(NumComments) from " + wp_table + " where SourceSite = ? and CreationDate = ?"
            c.execute(q, d)
            max_comments_day = c.fetchone()[0]
            DailyMaxMap[popi_item_daily] = (max_upvotes_day, max_comments_day)
        # Weekly maxima, memoized per (SourceSite, ProcessingDate).
        if popi_item_weekly in WeeklyMaxMap:
            max_upvotes_week = WeeklyMaxMap[popi_item_weekly][0]   # BUGFIX: was popi_item_daily
            max_comments_week = WeeklyMaxMap[popi_item_weekly][1]  # BUGFIX: was popi_item_daily
        else:
            q = "select max(NumUpvotes) from " + wp_table + " where SourceSite = ? and ProcessingDate = ?"
            d = (row[1], row[2])
            c.execute(q, d)
            max_upvotes_week = c.fetchone()[0]
            q = "select max(NumComments) from " + wp_table + " where SourceSite = ? and ProcessingDate = ?"
            c.execute(q, d)
            max_comments_week = c.fetchone()[0]
            WeeklyMaxMap[popi_item_weekly] = (max_upvotes_week, max_comments_week)
        popI = CalculatePopi(row[9], row[10], max_upvotes_day, max_comments_day, max_upvotes_week, max_comments_week, row[4], days[6], row[1])
        # pc.printWarn(" \t\t [wc_popi calculation] <ID={}><Source={}> ...................... PopI = {}".format(row[0],row[1],popI))
        query = 'update ' + wp_table + ' set PopI = ? where ID = ? and SourceSite = ?'
        data = (popI, row[0], row[1])
        c.execute(query, data)
    endTime = time.time()
    conn.commit()
    conn.close()
    pc.printMsg("\t -------------------------------------- < PopICalculator@wp: DB/wp Connection Closed > ---------------------------------------------\n")
    pc.printWarn("\t\t ---------------> TIME TAKEN FOR PopICalculator@wp => {} => TABLE: {}\n".format(round((endTime - startTime),5),wp_table))
def ContentFormatting(ts):
    """Update the Content & WeightedContent columns for every row of wc_<ts>.

    Per row:
    1. url_strings_content = getUrlString(row[13]) -> added into weighted_content
    2. clean_text(row[13]) -> Content
    3. clean_text(row[12])
    4. clean_text(row[5]) (title) -> WeightedContent = clean_text(row[12]) + " " + clean_title + " " + url_strings_content
    5. if the content column is still empty, the title is used for both columns

    Each html->text / cleaning call is guarded by a SIGALRM-based timeout so a
    single pathological page cannot stall the whole run.
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < Content Formatter: DB/wc Connection Opened > ---------------------------------------------\n")
    startTime = time.time()
    pc.printWarn("\tRunning ContentFormatter for wc ....... \t NOW: {}".format(time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn("\t\t. . . . . . . . . . . .......... Content Formatting Started @Content_Scraper ........... . . . . . . . . . . .")
    signal.signal(signal.SIGALRM, timeout_handler)  # timeouts on a few function calls, see below
    q = "select * from " + wc_table
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    conn.commit()
    for row in rows:
        t1 = time.time()
        row_list = list(row)
        if (len(row[13]) != 0):
            gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK += 1
            clean_title = clean_text(row_list[5])
            # NOTE(review): this inner branch is unreachable given the outer
            # len(row[13]) != 0 guard; kept as-is pending confirmation.
            if len(row_list[13]) == 0:
                pc.printWarn("\t\t\t\t --------- No content found on cleaning, using Title as Content :(")
                row_list[13] = clean_title
                row_list[12] = clean_title
            else:
                raw_content = row_list[13]
                signal.alarm(200)  # Timeout of 200 sec on function call
                content = clean_title  # if timeout happens, this will be the value of content
                try:
                    content = text_actions.contentfromhtml(raw_content)
                except Exception as exc:
                    pc.printErr("\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@ContentFromHtml ! ....using Title as content ".format(row[0], row[1]))
                    pass
                signal.alarm(200)  # Timeout of 200 sec on function call
                clean_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    clean_content = clean_text(content)
                except Exception as exc:
                    pc.printErr("\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@CleanText ! ....using Title as content ".format(row[0], row[1]))
                    pass
                signal.alarm(200)  # Timeout of 200 sec on function call
                weighted_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    weighted_content = text_actions.weightedcontentfromhtml(raw_content)
                except Exception as exc:
                    pc.printErr("\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@WeightedContentFromHtml ! ....using Title as weightedcontent ".format(row[0], row[1]))
                    pass
                signal.alarm(200)  # Timeout of 200 sec on function call
                clean_weighted_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    clean_weighted_content = clean_text(weighted_content)
                except Exception as exc:
                    pc.printErr("\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@CleanText ! ....using Title as weightedcontent ".format(row[0], row[1]))
                    pass
                signal.alarm(200)  # Timeout of 200 sec on function call
                url_string_text = ''  # if timeout happens, this will be the value of content
                try:
                    url_string_text = getUrlString(raw_content)
                except Exception as exc:
                    pc.printErr("\t <ID = {}><src= {} > Timeout of 200 sec happened on URL_STRING@getUrlString ! ....using empty str as url_string_text ".format(row[0], row[1]))
                    pass
                # FIX: cancel the pending alarm — previously it stayed armed and
                # could fire (raising in timeout_handler) during the DB update below
                # or a later row's processing.
                signal.alarm(0)
                row_list[13] = clean_content
                row_list[12] = clean_weighted_content + " " + url_string_text + " " + clean_title
            row = tuple(row_list)
            pc.printWarn("\t <ID = {}><src= {} > [Content Formatting] Done................ \t\t TimeTaken = {} \t NOW: {}".format(row[0], row[1], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime())))
            q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ? where ID = ? and SourceSite = ?'
            d = (row[13], row[12], row[0], row[1])
            c.execute(q, d)
            conn.commit()
        else:  # No content: fall back to the cleaned title for both columns.
            gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT += 1
            pc.printMsg("\t <ID = {}><src= {} > [Content Formatting] No content.Using title finally................ \t\t TimeTaken = {} \t NOW: {}".format(row[0], row[1], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime())))
            clean_title = clean_text(row_list[5])
            content = clean_title
            q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ? where ID = ? and SourceSite = ?'
            d = (content, content, row[0], row[1])
            c.execute(q, d)
            conn.commit()
    endTime = time.time()
    conn.close()
    pc.printMsg("\t -------------------------------------- < Content Formatter: DB/wc Connection Closed > ---------------------------------------------\n")
    pc.printSucc("\n\n***************************** Content Formatting is Complete. TABLE: {} ******************".format(wc_table))
    print("\n\n")
    table = PrettyTable(['Success (Post Content Formatting)', 'Notation(if any)', 'Value'])
    table.add_row(['IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK])
    table.add_row(['OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT])
    table.add_row(['TIME TAKEN - CONTENT FORMATTING (min)', '-', round((endTime - startTime) / 60, 5)])
    pc.printSucc(table)
    print("\n")
    pc.printWarn('\t\t\t------------------------->>>>>> [ TimeTaken for Content Formatting (min) = {} ]\n'.format(round((endTime - startTime), 5) / 60))
    print("\n\n")
def RunSync(ts):
    """Synchronous fallback scraper: fetch pages for every wc_<ts> row that
    still has an empty Content column, one request at a time.

    NOTE: pdf pages taking a lot of time.Is it right to scrape them still?
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    cur = conn.cursor()
    pc.printMsg("\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Opened > ---------------------------------------------\n")
    # Extensions that serve binary blobs — not worth fetching as text.
    blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4']
    pending = cur.execute("select * from " + wc_table + " where length(Content) = 0").fetchall()
    pc.printMsg("\n\n \t ******************************* ITEMS FOR SYNC TO SCRAPE = {} ******************************\n\n".format(len(pending)))
    conn.commit()
    for record in pending:
        tick = time.time()
        if (len(record[13]) == 0):
            try:
                if record[6][-4:] in blob_pages:
                    pc.printMsg("\t\txxxxx [SYNCED SCRAPED]\t... for ID: {} Found BLOB page SYNC. Will use title. URL: {} \t\t TimeTaken = {} \t NOW: {} ".format(record[0], record[6], round((time.time() - tick), 5), time.strftime("%H:%M:%S", time.localtime())))
                else:
                    resp = web_requests.hitGetWithRetry(record[6], '', False, 2, 0.5, 30)
                    if resp == -1:
                        # Even the retrying sync GET could not reach the URL.
                        gw.CS_SYNC_URL_UNREACHABLE += 1
                        pc.printErr("\t\tXXXXXXXXX [SYNCED SCRAPED]\t SKIPPING... <ID: {}> Totally unable to hit url even in SYNC: {} \t\t TimeTaken = {} \t NOW: {} ".format(record[0], record[6], round((time.time() - tick), 5), time.strftime("%H:%M:%S", time.localtime())))
                    else:
                        gw.CS_SYNC_ITEM_SCRAPED += 1
                        fields = list(record)
                        fields[13] = resp.text
                        record = tuple(fields)
                        pc.printWarn("\t <ID = {}><src= {} > [SYNCED SCRAPED] Done................ \t\t TimeTaken = {} \t NOW: {} ".format(record[0], record[1], round((time.time() - tick), 5), time.strftime("%H:%M:%S", time.localtime())))
                        cur.execute('update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?', (record[13], record[0], record[1]))
                        conn.commit()
            except Exception as e:
                gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR += 1
                pc.printErr("\t XXXXXXXXXXXXXX [SYNC SCRAPING] XXXX ==>> <ID = {}><src= {} > NOW = {} , \t\t TimeTaken = {} ....Sync Scraping failed too.Will use Title for content... \n \t\t ERROR=> {}".format(record[0], record[1], time.strftime("%H:%M:%S", time.localtime()), round((time.time() - tick), 5), e))
                pass
    endTime = time.time()
    conn.close()
    pc.printMsg("\t -------------------------------------- < CONTENT_SCRAPER_SYNC: DB/wc Connection Closed > ---------------------------------------------\n")
    pc.printSucc("\n\n***************************** Sync Content Scraping is Complete. TABLE: {} ******************".format(wc_table))
    print("\n\n")
    summary = PrettyTable(['Success (Post Sync Content Scraping)', 'Notation(if any)', 'Value'])
    summary.add_row(['IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES])
    summary.add_row(['OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)', gw.CS_SYNC_ITEM_SCRAPED])
    summary.add_row(['TIME TAKEN - SYNC CONTENT SCRAPING (min)', '-', round((endTime - startTime) / 60, 5)])
    pc.printSucc(summary)
    pc.printErr("------------------------------------------ ERRORS-SYNC (Written nonetheless, chill) ------------------------------------------------\n")
    failures = PrettyTable(['Failures (Post Sync Content Scraping)', 'Value'])
    failures.add_row(['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE])
    failures.add_row(['COUNT. TRY/CATCHED EXCEP. - SYNC ', gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR])
    pc.printErr(failures)
    print("\n")
    pc.printWarn('\t\t\t------------------------->>>>>> [ TimeTaken for Sync Scraping (min) = {} ]\n'.format(round((endTime - startTime), 5) / 60))
    print("\n\n")
async def RunAsync(ts):
    """Run asyncFetchAll gw.ASYNC_SERIES_CONNECTION times in series, sharing
    one sqlite connection; counts pre-filled rows into gw.CS_ITEMS_WRITTEN_DIRECT
    (meaningful only on the first iteration).
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    # Rows whose Content already exists need no scraping at all.
    c = conn.cursor()
    q = "select count(*) from " + wc_table + " where length(Content) != 0"
    c.execute(q)
    no_scraping_needed_item_count = c.fetchone()[0]
    gw.CS_ITEMS_WRITTEN_DIRECT = no_scraping_needed_item_count
    conn.commit()
    for i in range(1, gw.ASYNC_SERIES_CONNECTION + 1):
        gw.CS_BOYS_STILL_PLAYING = 0
        # NOTE: "\\/" (backslash-slash) below is deliberate — same bytes as the
        # old "\/" but without the invalid-escape SyntaxWarning.
        pc.printMsg("\n\n..........-------------\\/\\/\\/------\\/\\/\\/------\\/\\/\\/---------------............ Running Async for {} -th time - \t Numer of Async-runs remaining: {} \t\t NOW: {}\n\n".format(i, (gw.ASYNC_SERIES_CONNECTION - i), time.strftime("%H:%M:%S", time.localtime())))
        await asyncFetchAll(conn, ts, i)
        pc.printMsg("\t\t..........-------------\\/\\/\\/------............ {} -th Async Running is done.Sleeping for 10 sec now......ZZZZZZZzzzzzzzzz\t\t NOW: {}\n\n".format(i, time.strftime("%H:%M:%S", time.localtime())))
        # FIX: time.sleep() inside a coroutine blocks the event loop; yield instead.
        await asyncio.sleep(10)
    conn.close()
    endTime = time.time()
    pc.printSucc("\n\n***************************** All {} Async Content Scraping is Complete. TABLE: {} ******************".format(gw.ASYNC_SERIES_CONNECTION, wc_table))
    print("\n\n")
    table = PrettyTable(['Success (Post ALL series Async Content Scraping)', 'Notation(if any)', 'Value'])
    table.add_row(['IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['OUT : ITEMS WRITTEN DIRECT(no scraping needed) ', '[A] (A+B1+B2+C=X)', gw.CS_ITEMS_WRITTEN_DIRECT])
    table.add_row(['OUT : ITEMS SCRAPED WITH ASYNC', '[B] (A+B+C=X)', gw.CS_ASYNC_ITEM_SCRAPED])
    table.add_row(['TIME TAKEN - ASYNC CONTENT SCRAPING (min)', '-', round((endTime - startTime) / 60, 2)])
    pc.printSucc(table)
    pc.printErr("------------------------------------------ ERRORS-ASYNC (Written nonetheless, chill) ------------------------------------------------\n")
    table = PrettyTable(['Failures (Counted as-in last run of Async Content Scraping)', 'Value'])
    table.add_row(['COUNT. UNREACHABLE URLS in ASYNC ', gw.CS_ASYNC_URL_UNREACHABLE])
    table.add_row(['COUNT. TRY/CATCHED SEMA EXCEP. in ASYNC ', gw.CS_ASYNC_SEMA_EXCEPTION_ERR])
    # FIX: this row used to be added AFTER pc.printErr(table), so it was never
    # displayed. (Label looks copy-pasted from the URL-scraping stage — TODO confirm.)
    table.add_row(['TIME TAKEN FOR URL SCRAPING-r (min) ', round((endTime - startTime) / 60, 2)])
    pc.printErr(table)
    print("\n")
    pc.printWarn('\t\t\t------------------------->>>>>> [ TimeTaken for All {} Sync Scraping (min) = {} ]\n'.format(gw.ASYNC_SERIES_CONNECTION, round((endTime - startTime), 5) / 60))
    print("\n\n")
async def asyncFetchAll(conn, ts, series_count):  # series_count : {1,gw.ASYNC_SERIES_CONNECTION}
    """Fan out one async scraping pass over every wc_<ts> row with empty Content.

    Only fills the raw Content column here — no cleaning, WeightedContent or
    UrlString extraction (that happens later in ContentFormatting).

    Args:
        conn: open sqlite3 connection to wc.db, shared with the fetch tasks.
        ts: float epoch timestamp naming the weekly table.
        series_count: which serial pass this is (1-based).
    """
    wc_table = 'wc_' + str(int(ts))
    c = conn.cursor()
    q = "select * from " + wc_table + " where length(Content) = 0"  # only the rows without content
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    conn.commit()
    pc.printMsg("\t -------------------------------------- < CONTENT_SCRAPER_ASYNC: DB/wc Connection Opened > ---------------------------------------------\n")
    startTime = time.time()
    socket.gethostbyname("")  # NOTE(review): looks like a resolver warm-up — confirm it is intentional
    connector = TCPConnector(limit=gw.CONNECTION_COUNT, family=socket.AF_INET, verify_ssl=False)
    pc.printMsg("\n\n===================================================================== Doing {}-th Async Scraping in the same table =====================================================================\n\n".format(series_count))
    async with ClientSession(headers={'Connection': 'keep-alive'}, connector=connector) as session:
        tasks = []
        sem = asyncio.Semaphore(gw.SEMAPHORE_COUNT)
        for row in rows:
            # row indices: ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),
            # CreationDate(4),Title(5),Url(6),SourceTags(7),ModelTags(8),NumUpvotes(9),
            # NumComments(10),PopI(11),WeightedContent(12),Content(13)
            if (row[5] and row[6]):  # need both a title and a url; else ignore the entry
                gw.CS_BOYS_STILL_PLAYING += 1
                if gw.CS_BOYS_STILL_PLAYING % gw.CS_BOYS_PLAYING_LIMIT == 0:
                    pc.printMsg("\t [ASYNC_SCRAPING] sleeping for 1 sec...zzzzzzzzz....... \t BOYS_STILL_PLAYING = {}".format(gw.CS_BOYS_STILL_PLAYING))
                    # FIX: time.sleep() here blocked the event loop, so already-scheduled
                    # fetch tasks could not make progress during the throttle pause.
                    await asyncio.sleep(1)
                task = asyncio.ensure_future(semaphoreSafeFetch(conn, sem, row, session, series_count, ts))
                tasks.append(task)
        await asyncio.gather(*tasks)
    endTime = time.time()
    pc.printSucc("\n***************************** {} -th Async Content Scraping is Complete. TABLE: {} ******************".format(series_count, wc_table))
    print("\n\n")
    table = PrettyTable(['Success (Post Async Content Scraping)', 'Notation(if any)', 'Value'])
    table.add_row(['OUT : TOTAL ITEMS SCRAPED WITH ASYNC YET', '[B] (A+B+C=X)', gw.CS_ASYNC_ITEM_SCRAPED])
    pc.printSucc(table)
    print("\n")
    pc.printWarn('\t\t\t------------------------->>>>>> [ TimeTaken for Async Scraping (min) = {} ]\n'.format(round((endTime - startTime), 5) / 60))
    print("\n\n")
def run(ts):
    """Scrape Algolia's HN api for the last 7 days and put the data in WC-DB.

    * max number of entries in algolia's single api call = 1000, so scrape one
      day at a time (7 intervals).
    * API documentation: https://hn.algolia.com/api

    Notes:
        1. AskHN entries get the 'prog_query' tag and their own threshold.
        2. ShowHN entries get the 'sideproj' tag and their own threshold.
        3. Jobs@HN entries are postponed (no upvotes/comments).

    Args:
        ts: float epoch timestamp (e.g. 1598692058.887741) naming wc_<int(ts)>.
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started HN-scraper ................... => TABLE: {}\n'.format(datetime.fromtimestamp(ts), wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n")
    startTime = time.time()
    # ts_arr holds the last 7 days' (including today's) epoch strings, newest first.
    # TIP: `datetime.fromtimestamp(int(t))` converts one back to a readable date.
    ts_arr = [str(int(ts))]
    for i in range(6):
        new_ts = datetime.fromtimestamp(int(ts_arr[-1])) + timedelta(days=-1)
        new_ts = new_ts.timestamp()
        ts_arr.append(str(int(new_ts)))
    index = gw.WC_TOTAL_URL_ENTRIES + 1
    for i in range(len(ts_arr) - 1):
        startepoch = ts_arr[i]
        endepoch = ts_arr[i + 1]
        pc.printMsg(" ................. scraping for interval: start= {} -> end = {} .................\n".format(startepoch, endepoch))
        # --- stories (also catches TellHN -> <tech_discuss> and LaunchHN -> <startup>) ---
        pc.printWarn(" \t............. scraping stories .............")
        try:
            url_story = 'http://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=9999&numericFilters=created_at_i>' + str(endepoch) + ',created_at_i<' + str(startepoch) + ',points>' + str(gw.HN_STORY_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_story)
            payload = json.loads(data.content)  # FIX: parse the response once, not twice
            res_size = payload["nbHits"]
            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = payload["hits"]
            for item in items_arr:
                url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                sourceTag = ''
                content = ''
                sourceSite = 'HN'
                if (item["url"] is None):  # as all ShowHNs may not have an url
                    if (item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                    if ("Launch HN:" in item["title"]):  # 1. LaunchHN
                        sourceTag = 'startup'
                        sourceSite += '/launch'
                    if ("Tell HN:" in item["title"]):  # 2. TellHN
                        sourceTag = 'tech_discuss'
                        sourceSite += '/tell'
                else:
                    url = item["url"]
                entry = [index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.HNDate(str(item["created_at"])), item["title"], url, sourceTag, '', item["points"], item["num_comments"], '', '', text_actions.clean_text(content)]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass
        # --- ShowHNs ---
        pc.printWarn("\t............. scraping showHNs .............")
        try:
            url_show = 'http://hn.algolia.com/api/v1/search_by_date?tags=show_hn&hitsPerPage=9999&numericFilters=created_at_i>' + str(endepoch) + ',created_at_i<' + str(startepoch) + ',points>' + str(gw.HN_SHOWHN_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_show)
            payload = json.loads(data.content)  # FIX: parse the response once, not twice
            res_size = payload["nbHits"]
            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = payload["hits"]
            for item in items_arr:
                content = ''
                sourceSite = 'HN/show'
                if (item["url"] is None):  # as all ShowHNs may not have an url
                    url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                    if (item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"]
                entry = [index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.HNDate(str(item["created_at"])), item["title"], url, 'sideproj', '', item["points"], item["num_comments"], '', '', text_actions.clean_text(content)]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass
        # --- AskHNs ---
        pc.printWarn("\t............. scraping askHNs .............")
        try:
            url_ask = 'http://hn.algolia.com/api/v1/search_by_date?tags=ask_hn&hitsPerPage=9999&numericFilters=created_at_i>' + str(endepoch) + ',created_at_i<' + str(startepoch) + ',points>' + str(gw.HN_ASKHN_UPVOTE_TH)
            data = web_requests.hitGetWithRetry(url_ask)
            payload = json.loads(data.content)  # FIX: parse the response once, not twice
            res_size = payload["nbHits"]
            pc.printWarn("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = payload["hits"]
            for item in items_arr:
                content = ''
                sourceSite = 'HN/ask'
                if (item["url"] is None):  # as AskHNs dont have any url
                    url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                    if (item["story_text"] is not None):
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"]
                entry = [index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.HNDate(str(item["created_at"])), item["title"], url, 'prog_query', '', item["points"], item["num_comments"], '', '', text_actions.clean_text(content)]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass
    endTime = time.time()
    conn.commit()
    conn.close()
    gw.WC_TOTAL_URL_ENTRIES += gw.HN_TOTAL_ITEMS_GOT_YET
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n")
    pc.printSucc("\n\n***************************** HN Url Scraping is Complete. TABLE: {} ******************".format(wc_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post HN URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by HN', gw.HN_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-HN (sec) ', round((endTime - startTime), 5)])
    pc.printSucc(table)
    print("\n\n")
async def fetchWithRetry(conn, row, session, series_count, ts):
    """ Hits url(with retries):
        * if status == 200: writes the raw page text into the row's Content
          column and returns the updated row tuple
        * if still unable to hit after retries: returns [] (caller falls back
          to sync scraping / title-as-content later)
    INPUT: `row` is an array with indices:
        ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
        SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
    """
    status = 400        # sentinel "not yet succeeded" status
    retry_cnt = 2       # total attempts before giving up
    sleep_time = 0.1    # back-off between attempts (seconds)
    t1 = time.time()
    while retry_cnt > 0 and status != 200:
        # NOTE(review): ssl.Purpose.CLIENT_AUTH is the server-side purpose; a
        # client normally wants SERVER_AUTH. As written this context does not
        # verify the server cert (which may be intentional, given the
        # connector's verify_ssl=False) — confirm before changing.
        async with session.get(row[6], ssl=ssl.create_default_context(
                purpose=ssl.Purpose.CLIENT_AUTH),
                timeout=gw.CS_ASYNC_REQ_TIMEOUT) as response:
            # res = await response.content.read()  # returns blob which gives error while ContentFormatter; hence discarded
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                # Success: bump counters and persist the scraped page.
                gw.CS_ASYNC_ITEM_SCRAPED += 1
                gw.CS_BOYS_STILL_PLAYING -= 1  # NOTE(review): only decremented on success — failures never decrement; confirm intended
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== [ASYNCED SCRAPED#{}] Done ....... \t\t TimeTaken = {} \t NOW: {}"
                    .format(row[0], row[1], series_count,
                            round((round((time.time() - t1), 5)), 5),
                            time.strftime("%H:%M:%S", time.localtime())))
                # Replace the Content field (tuples are immutable, hence the list round-trip).
                row_list = list(row)
                row_list[13] = res
                row = tuple(row_list)
                wc_table = 'wc_' + str(int(ts))
                try:
                    # Uses the caller's shared connection; commit per row so
                    # progress survives a crashed run.
                    c = conn.cursor()
                    q = 'update ' + wc_table + ' set Content = ? where ID = ? and SourceSite = ?'
                    d = (row[13], row[0], row[1])
                    c.execute(q, d)
                    pc.printWarn(
                        " \t\t ============== <ID= {} ><{}> [ASYNC ContentScraped] \t INSERTED INTO TABLE :: gw.SQL_CONN_OPEN = {} =============== "
                        .format(row[0], row[1], gw.SQL_CONN_OPEN))
                    conn.commit()
                except Exception as e:
                    # DB write failed — log and still return the in-memory row.
                    pc.printMsg(
                        " \t\t === XXXX ====== <ID= {} ><{}> [ASYNC ContentScraped] \t ERRR in INSERTED INTO TABLE :: gw.SQL_CONN_OPEN = {} =============== "
                        .format(row[0], row[1], gw.SQL_CONN_OPEN))
                    logging.error(traceback.format_exc())
                    pass
                return row
            else:
                # Non-200 (or empty body): back off and retry.
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x"
                    .format(row[0], row[1], status, row[6][:25], sleep_time,
                            retry_cnt))
                await asyncio.sleep(sleep_time)
    # All retries exhausted: count as unreachable only on the final series pass.
    if series_count == gw.ASYNC_SERIES_CONNECTION:
        gw.CS_ASYNC_URL_UNREACHABLE += 1
        pc.printErr(
            "\t\txxxxx For <ID = {}><src= {} >Totally unable to hit url.... Will try sync later: {} \t\t TimeTaken = {} \t NOW: {}"
            .format(row[0], row[1], row[6], round((time.time() - t1), 5),
                    time.strftime("%H:%M:%S", time.localtime())))
    return []
def run(ts):
    """ I. Creates wc_table(in wc.db) & wp_table(in wp.db) for the week
        II. Runs following scrapers serially and updates them in WC-DB:
            1. hn_scraper.py
            2. r_scraper.py
            3. ph_scraper.py => Api exists, Scraping not allowed(doing it anyway)
            4. ih_scraper.py => No Api, Scraping not allowed(postponed for later)

        Input: float(timestamp) - set when the main.py run is triggered
               * float because o/w `datetime.fromtimestamp(ts)` wont run on int
        Output: None, just put data in WC-DB
    """
    startTime = time.time()
    # --- Initialize the weekly content tables in wc.db and wp.db ---
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    c.execute("SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'".format(wc_table))
    if c.fetchone()[0] == 1:  # table exists, flush away!
        c.execute("delete from {}".format(wc_table))
    else:  # creating new table
        c.execute("CREATE TABLE {} (ID, SourceSite, ProcessingDate,ProcessingEpoch,CreationDate, Title, Url, SourceTags,ModelTags,NumUpvotes, NumComments, PopI,WeightedContent,Content)".format(wc_table))
        pc.printSucc("\n**************************************************** wc_table created => {} **************************************************** \n".format(wc_table))
    # FIX: the wc connection was silently dropped without commit/close — the
    # `delete from` above could be rolled back, and its open write transaction
    # could lock wc.db against the scrapers' own connections.
    conn.commit()
    conn.close()
    wp_db = 'dbs/wp.db'
    wp_table = 'wp_' + str(int(ts))
    conn = sqlite3.connect(wp_db, timeout=10)
    c = conn.cursor()
    c.execute("SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{}'".format(wp_table))
    if c.fetchone()[0] == 1:  # table exists, flush away!
        # FIX: this previously flushed wc_table (copy-paste), which doesn't even
        # exist in wp.db — the wp table was never emptied.
        c.execute("delete from {}".format(wp_table))
    else:  # creating new table
        c.execute('''CREATE TABLE {} (ID, SourceSite, ProcessingDate,ProcessingEpoch,CreationDate, Title, Url, ThumbnailUrl,SourceTags,NumUpvotes, NumComments, PopI,Content)'''.format(wp_table))
        pc.printSucc("\n**************************************************** wp_table created => {} **************************************************** \n".format(wp_table))
    conn.commit()
    conn.close()
    # --- Run the scrapers sequentially; each failure is logged, never fatal ---
    pc.printWarn(". . . . . . . . . . . . . . . ...... Started Running all the scrapers ...... . . . . . . . . . . . . . . .\n")
    try:
        hn_scraper.run(ts)
        pc.printSucc("\n================ HH url scraper run: Complete ================\n")
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-HN xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\t>>> Error = {}".format(str(e)))
        logging.error(traceback.format_exc())
        pass
    try:
        r_scraper.run(ts)
        pc.printSucc(" \n================ Reddit url scraper run: Complete ================\n")
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-Reddit xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\tError = {}".format(str(e)))
        logging.error(traceback.format_exc())
        pass
    try:
        ph_scraper.run(ts)
        pc.printSucc(" \n================ PH url scraper run: Complete ================\n")
    except Exception as e:
        pc.printErr(" xxxxxxxxxxxxxxxxxxxxxxxxx Error in Running Url Scraper-PH xxxxxxxxxxxxxxxxxxxxxxxxx \n \t\tError = {}".format(str(e)))
        logging.error(traceback.format_exc())
        pass
    # TODO: add ih_scraper (no API, scraping not allowed — postponed) and Lobsters here
    endTime = time.time()
    pc.printSucc(" ********************************************** URL Scraping(HN,r,PH) is complete *******************************************\n")
    print("\n\n")
    table = PrettyTable(['Entity (Post all URL Scraping)', 'Value'])
    table.add_row(['TOTAL URL ITEMS IN WC TABLE ', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-All (min) ', round((endTime - startTime) / 60, 2)])
    pc.printSucc(table)
    print("\n\n")
def run(ts):
    """Scrape the configured subreddits' top weekly submissions into wc_<ts>.

    Pulls up to gw.R_ITEM_LIMIT_PER_SUBREDDIT submissions per subreddit via
    PRAW (API hard cap is 1000). Locked posts and binary-blob links are
    skipped; self-posts keep their selftext as content.

    Args:
        ts: float epoch timestamp naming the weekly table.
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started r-scraper ................... => TABLE: {}\n'.format(datetime.fromtimestamp(ts), wc_table))
    pc.printMsg("\t -------------------------------------- < r_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n")
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4']  # these give blob data; no point in scraping them
    index = gw.WC_TOTAL_URL_ENTRIES + 1
    # Setup Client
    reddit = praw.Reddit(
        client_id=vault.R_CLIENT_ID,          # PERSONAL_USE_SCRIPT_14_CHARS
        client_secret=vault.R_CLIENT_SECRET,  # SECRET_KEY_27_CHARS
        user_agent=vault.R_USER_AGENT,        # YOUR_APP_NAME
        username=vault.R_USERNAME,            # YOUR_REDDIT_USER_NAME
        password=vault.R_PASSWORD)            # YOUR_REDDIT_LOGIN_PASSWORD
    for subreddit, tag_arr in LIST.items():
        try:
            pc.printWarn("\t ............ Subreddit@R_UrlScraping : {} .............".format(subreddit))
            sr = reddit.subreddit(subreddit)
            ENTRIES_IN_THIS_SUBRDDIT = 0
            for submission in sr.top('week', limit=gw.R_ITEM_LIMIT_PER_SUBREDDIT):  # NOTE: max limit is 1000
                content = ''
                # Fix permalink-style urls so they are absolute.
                url = submission.url
                if (url[:2] == '/r'):
                    url = "https://www.reddit.com" + url
                # Check1: only take posts that are not locked by mods.
                if not submission.locked:
                    # Check2: if post is just an image/blob, discard it
                    # (reddit currently hosts .png & .gif only).
                    if submission.url[-4:] not in blob_pages:
                        # If permalink is a substring of url OR submission is a selfpost
                        # (text-only) => no need to scrape; keep the selftext.
                        if ((submission.permalink in submission.url) or submission.is_self):
                            content = submission.selftext
                        entry = [index, "r/" + subreddit, datetime.fromtimestamp(ts).date(), int(ts), date_conversion.RedditDate(str(datetime.fromtimestamp(submission.created))), submission.title, url, json.dumps(tag_arr), '', submission.score, submission.num_comments, '', '', text_actions.clean_text(content)]
                        c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                        index += 1
                        ENTRIES_IN_THIS_SUBRDDIT += 1
            gw.R_TOTAL_ITEMS_GOT_YET += ENTRIES_IN_THIS_SUBRDDIT
            pc.printMsg("\t\t\t\t\t ====> ENTRIES_IN_THIS_SUBRDDIT = {} \t\t | \t gw.R_TOTAL_ITEMS_GOT_YET = {}".format(ENTRIES_IN_THIS_SUBRDDIT, gw.R_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n".format(index, e))
            logging.error(traceback.format_exc())
            pass
    endTime = time.time()
    gw.WC_TOTAL_URL_ENTRIES += gw.R_TOTAL_ITEMS_GOT_YET
    conn.commit()
    conn.close()
    pc.printMsg("\t -------------------------------------- < r_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n")
    pc.printSucc("\n\n***************************** Reddit Url Scraping is Complete. TABLE: {} ******************".format(wc_table))
    print("\n\n")
    table = PrettyTable(['Entity (Post r URL Scraping)', 'Value'])
    # FIX: label previously said "by HN" (copy-paste from the HN scraper).
    table.add_row(['TOTAL URLS FETCHED by r', gw.R_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-r (min) ', round((endTime - startTime) / 60, 2)])
    pc.printSucc(table)
    print("\n\n")
def run(ts):
    """Entry point for the content-scraping stage against table ``wc_<int(ts)>``.

    Pipeline:
      1. Scrape content asynchronously (``RunAsync``) inside the current loop
         (``nest_asyncio`` allows nesting an event loop inside another).
      2. Re-scrape whatever async missed, synchronously (``RunSync``).
      3. Run the final content-formatting pass (``ContentFormatting``).
    Afterwards prints summary and error tables built from the ``gw`` global
    counters that the three stages update.

    Args:
        ts: epoch timestamp (format: 1598692058.887741); ``int(ts)`` names
            the working-content table.
    """
    nest_asyncio.apply()  # to be able to run async loop from an async loop
    wc_table = 'wc_' + str(int(ts))
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(ASYNC) .......[Sema = {}, conn_lim ={}]............ => TABLE: {}\n'
        .format(datetime.fromtimestamp(ts), gw.SEMAPHORE_COUNT,
                gw.CONNECTION_COUNT, wc_table))
    startTime = time.time()

    """ scrape content in async """
    asyncio.get_event_loop().run_until_complete(
        asyncio.ensure_future(RunAsync(ts)))
    time.sleep(10)  # let in-flight writes settle before the sync pass

    """ scrape remaining items with sync """
    RunSync(ts)

    """ formatting everything in the end-done in sync """
    time.sleep(10)
    ContentFormatting(ts)
    endTime = time.time()

    pc.printSucc(
        "\n\n\n\n\n****************** Content Scraping is Complete , TABLE: {} ********************"
        .format(wc_table))
    print("\n\n")

    # Success accounting: A+B+C should equal X, and P+Q should equal X.
    table = PrettyTable(
        ['Entities (Post Content Scraping-all)', 'Notation(if any)', 'Value'])
    table.add_row([
        'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)',
        gw.WC_TOTAL_URL_ENTRIES
    ])
    table.add_row([
        'CS_OUT : ITEMS SCRAPED WITH ASYNC', '[A] (A+B+C=X)',
        gw.CS_ASYNC_ITEM_SCRAPED
    ])
    table.add_row([
        'CS_OUT : ITEMS WRITTEN DIRECT(no scraping needed) ', '[B] (A+B+C=X)',
        gw.CS_ITEMS_WRITTEN_DIRECT
    ])
    table.add_row([
        'CS_OUT : ITEMS SCRAPED WITH SYNC', '[C] (A+B+C=X)',
        gw.CS_SYNC_ITEM_SCRAPED
    ])
    table.add_row([
        'CF_OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK
    ])
    table.add_row([
        'CF_OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT
    ])
    pc.printSucc(table)

    # Failure accounting (items were still written, just without scraped content).
    pc.printErr(
        "\n\n------------------------------------------ ERRORS (Written nonetheless, chill) ------------------------------------------------\n"
    )
    table = PrettyTable(['Failures (Post Content Scraping-all)', 'Value'])
    table.add_row(
        ['COUNT. UNREACHABLE URLS - ASYNC ', gw.CS_ASYNC_URL_UNREACHABLE])
    table.add_row([
        'COUNT. TRY/CATCHED SEMA EXCEP. - ASYNC ',
        gw.CS_ASYNC_SEMA_EXCEPTION_ERR
    ])
    table.add_row(
        ['COUNT. UNREACHABLE URLS - SYNC ', gw.CS_SYNC_URL_UNREACHABLE])
    table.add_row([
        'COUNT. TRY/CATCHED EXCEP. - SYNC ',
        gw.CS_SYNC_TRIED_CATCH_EXCEPTION_ERR
    ])
    pc.printErr(table)
    print("\n")

    # BUGFIX: divide by 60 *before* rounding so the printed value really is
    # minutes rounded to 5 places (the old code rounded seconds, then divided).
    pc.printWarn(
        '\t\t\t\t------------------------->>>>>> [ Time Taken(min) = {} ]\n\n\n\n\n\n'
        .format(round((endTime - startTime) / 60, 5)))
    print("\n\n\n\n")
async def asyncFetchAll(ts):
    """ INPUT: ts (format: 1598692058.887741)

    Scrape content for every row of table ``wc_<int(ts)>`` in dbs/wc.db:

      * Rows that already have Content (column 13) are NOT scraped: the
        existing content is re-used, the WeightedContent is rebuilt from the
        old weighted content + cleaned title + URL strings, and the row is
        updated in place.
      * Rows with both a Title (5) and Url (6) are fetched concurrently via
        ``semaphoreSafeFetch`` (bounded by a semaphore of SEMAPHORE_COUNT and
        a TCP connection pool of CONNTECTION_COUNT), then cleaned and written
        back the same way.
      * Rows with neither are ignored.

    Side effects: updates the module counters ENTRIES_TO_BE_WRITTEN,
    WRITTEN_ENTRIES_ASYNC_DIRECT, WRITTEN_ENTRIES_ASYNC_SCRAPED and
    ASYNC_ENTRIES_TO_BE_SCRAPED; commits and closes the sqlite connection.
    """
    # NOTE: the spelling CONNTECTION_COUNT matches the module-level global.
    global CONNTECTION_COUNT, SEMAPHORE_COUNT
    tasks = []
    sem = asyncio.Semaphore(SEMAPHORE_COUNT)

    # ========================== init connection
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER: DB Connection Opened > ---------------------------------------------\n"
    )
    # (Removed: unused `stratTime`/`endTime` timers and commented-out CSV /
    # alternate-connector code.)

    global ENTRIES_TO_BE_WRITTEN
    global WRITTEN_ENTRIES_ASYNC_SCRAPED
    global WRITTEN_ENTRIES_ASYNC_DIRECT
    global ASYNC_ENTRIES_TO_BE_SCRAPED

    # NOTE(review): verify_ssl=False disables certificate checking (and is a
    # deprecated aiohttp spelling of ssl=False) — kept as-is deliberately.
    connector = TCPConnector(limit=CONNTECTION_COUNT,
                             family=socket.AF_INET,
                             verify_ssl=False)
    async with ClientSession(headers={'Connection': 'keep-alive'},
                             connector=connector) as session:
        q = "select * from " + wc_table
        rows_head = c.execute(q)
        rows = rows_head.fetchall()
        for row in rows:
            """ ============= row is an array with indices:
            ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
            SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
            """
            ENTRIES_TO_BE_WRITTEN += 1
            if (len(row[13]) != 0):
                # Content already present: rebuild WeightedContent and write
                # straight back — no network fetch needed.
                pc.printWarn(
                    "\t <ID = {}><src= {} > [NO SCRAPING] Content already exists...............NOW: {}"
                    .format(row[0], row[1],
                            time.strftime("%H:%M:%S", time.localtime())))
                clean_content = row[13]  # Already cleaned in url_scraper
                url_strings_content = getUrlString(row[13])
                clean_title = clean_text(row[5])
                clean_weighted_content = clean_text(
                    row[12]) + " " + clean_title + " " + url_strings_content
                query = 'update ' + wc_table + ' set Content = ? , WeightedContent = ? where ID = ? and SourceSite = ?'
                data = (clean_content, clean_weighted_content, row[0], row[1])
                c.execute(query, data)
                WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                pc.printSucc(
                    " \t\t ============== <ID= {} ><{}> [Direct] INSERTED INTO TABLE =============== "
                    .format(row[0], row[1]))
            elif (row[5] and row[6]):  # else ignore the entry
                ASYNC_ENTRIES_TO_BE_SCRAPED += 1
                print("\t\t\t\t\t SENT...... SENT_COUNT = {}".format(
                    ASYNC_ENTRIES_TO_BE_SCRAPED))
                task = asyncio.ensure_future(
                    semaphoreSafeFetch(sem, row, session))
                tasks.append(task)

        # Wait for every scheduled fetch, then persist whatever came back.
        responses = await asyncio.gather(*tasks)
        for row in responses:
            if row:
                clean_content = clean_text(row[13])
                url_strings_content = getUrlString(row[13])
                clean_title = clean_text(row[5])
                clean_weighted_content = clean_text(
                    row[12]) + " " + clean_title + " " + url_strings_content
                query = 'update ' + wc_table + ' set Content = ? , WeightedContent = ? where ID = ? and SourceSite = ?'
                data = (clean_content, clean_weighted_content, row[0], row[1])
                c.execute(query, data)
                WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                pc.printSucc(
                    " \t\t ============== <ID= {} ><{}> [Scraped] INSERTED INTO TABLE =============== "
                    .format(row[0], row[1]))

    conn.commit()
    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < CONTENT_SCRAPER: DB Connection Closed > ---------------------------------------------\n"
    )