import asyncio
import csv
import signal
import sqlite3
import ssl
import time
from datetime import datetime

from prettytable import PrettyTable

import text_actions
from text_actions import clean_text, getUrlString

# Remaining project-local modules used below (pc: colored console printing,
# gw: global run counters, csv_functions, web_requests) keep their repo
# imports from the top of this module.

# Module-level run counters (the repo initialises these near the top of the file):
WRITTEN_ENTRIES_SYNC = 0
SKIPPED_SYNC = 0
FAILED_SYNC = 0
SKIPPED_ASYNC = 0
WRITTEN_ENTRIES_ASYNC_SCRAPED = 0
WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING = 0
WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR = 0
ERR_ASYNC_NO_CONTENT_IN_SCRAPING = 0
ERR_ASYNC_ON_URL_ERROR = 0


async def fetchWithRetry(row, session):
    """
    Hits url (with retries):
        * if status == 200: scrape the page, put cleaned Content & WeightedContent
          into `row` (falling back to Title if nothing useful was extracted)
        * if still unable to hit after retries: count it as skipped and return
          `row` unchanged
    """
    status = 400
    retry_cnt = 3
    sleep_time = 10
    TIMEOUT = 60

    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if status == 200 and len(res) != 0:
                pc.printSucc("\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}".format(
                    row["ID"], row["SourceSite"], time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.clean_text(
                    text_actions.weightedcontentfromhtml(res) + row["Title"] + urlstrings)
                row["Content"] = text_actions.clean_text(
                    text_actions.contentfromhtml(res) + urlstrings)
                if len(row["Content"]) == 0:
                    row["WeightedContent"] = text_actions.clean_text(row["Title"])
                    row["Content"] = text_actions.clean_text(row["Title"])
                return row
            else:
                retry_cnt -= 1
                pc.printWarn("\t x---------------- Unable to hit URL(ERR_CODE={}): {} Sleeping for {} Retries remaining = {} -------------x".format(
                    status, row["Url"], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr("\t\txxxxx SKIPPING... for <ID = {}><src= {} > Unable to hit url: {} , ".format(
        row["ID"], row["SourceSite"], row["Url"]))
    global SKIPPED_ASYNC
    SKIPPED_ASYNC += 1
    return row
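
# A minimal driver sketch for the coroutine above (an assumption, not the
# repo's actual caller): it presumes `session` is an aiohttp.ClientSession,
# which matches the session.get(..., ssl=..., timeout=...) signature used here.
# `scrapeRowsAsync` is a hypothetical name.
import aiohttp


async def scrapeRowsAsync(rows):
    async with aiohttp.ClientSession() as session:
        # return_exceptions=True keeps one bad URL from cancelling the batch
        return await asyncio.gather(*[fetchWithRetry(row, session) for row in rows],
                                    return_exceptions=True)

# usage: results = asyncio.run(scrapeRowsAsync(rows))
# where rows is a list of dicts keyed by "ID", "Url", "Title", "Content", ...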

def RunSync(ts):
    """
    Picks the wc-db table mapped to `ts` and scrapes (useful) "clean" Content
    & WeightedContent from each row's url.
        * NOTE: if content is already present in the table, "clean" it too &
          append the newly scraped content to it.
        * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
    Input: ts (format: 1598692058.887741)
    """
    pc.printMsg('@[{}] >>>>>> Started Content-scraper(SYNC) ................... => FILENAME: {}\n'.format(
        datetime.fromtimestamp(ts), 'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'))

    csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(int(ts)) + '.csv'
    csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_sync.csv'
    headers = ['ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
               'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
               'PopI', 'WeightedContent', 'Content']
    csv_functions.creteCsvFile(csv_dest_file, headers)
    f = csv.writer(open(csv_dest_file, "w"))  # flush the old file
    f.writerow(headers)

    with open(csv_src_file, mode='r') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Headers are {", ".join(row)}')
                line_count += 1
            # CHECK 1 (pre-scraping): if content != NULL => no scraping, write the row as is
            if len(row["Content"]) != 0:
                pc.printWarn("\t <ID = {} > [NO SCRAPING] Content already exists....putting as it is............. NOW: {}".format(
                    row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                entry = [
                    row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                    row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                    row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                    text_actions.clean_text(row["Title"] + row["WeightedContent"])
                        + text_actions.getUrlString(row["Content"]),  # add the url-words too
                    text_actions.clean_text(row["Content"])
                        + text_actions.getUrlString(row["Content"]),
                ]
                global WRITTEN_ENTRIES_SYNC
                WRITTEN_ENTRIES_SYNC += 1
                f = csv.writer(open(csv_dest_file, "a"))
                f.writerow(entry)
            # CHECK 2 (pre-scraping): if url == NULL => discard
            # CHECK 3 (pre-scraping): if title == NULL => discard
            elif (len(row["Url"]) != 0) and (len(row["Title"]) != 0):
                pc.printWarn("\t <ID = {} > [SCRAPING BEGIN] sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(
                    row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                time.sleep(0.0001)
                try:
                    response = web_requests.hitGetWithRetry(row["Url"], '', False, 2, 0.5, 60)
                    if response != -1:
                        content = text_actions.contentfromhtml(response.text)
                        urlstrings = text_actions.getUrlString(content)
                        content += urlstrings  # add the url-words too
                        weightedcontent = (text_actions.weightedcontentfromhtml(response.text)
                                           + row["Title"] + urlstrings)  # add the url-words too
                        line_count += 1
                        # CHECK 1 (post-scraping): if content == NULL (title already
                        # checked above) => use the title as Content & WeightedContent
                        if len(content) == 0:
                            content = row["Title"]
                            weightedcontent = row["Title"]
                        entry = [
                            row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                            row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                            row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                            text_actions.clean_text(weightedcontent),
                            text_actions.clean_text(content),
                        ]
                        f = csv.writer(open(csv_dest_file, "a"))
                        f.writerow(entry)
                        pc.printMsg("\t\t <ID = {} > ============== Scraping Done....... \t NOW: {}".format(
                            row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                    else:
                        global SKIPPED_SYNC
                        SKIPPED_SYNC += 1
                        pc.printErr("\t\txxxxx SKIPPING... for ID: {} Unable to hit url: {} , ".format(
                            row["ID"], row["Url"]))
                except Exception as e:
                    global FAILED_SYNC
                    FAILED_SYNC += 1
                    pc.printErr("\t======= XXXXXXXX ERROR XXXXXX ======>> ID= {} NOW = {} Skipping...Failed due to: \n \t\t ERROR {}".format(
                        row["ID"], time.strftime("%H:%M:%S", time.localtime()), e))

    pc.printMsg("\n****************** Content Scraping is Complete , FILENAME: {} ********************\n".format(
        'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'))
    pc.printMsg("\n----------------------------------------------------------------------------------\n")
    pc.printMsg("|\tWRITTEN_ENTRIES_SYNC \t | \t {} \t|".format(WRITTEN_ENTRIES_SYNC))
    pc.printMsg("|\tSKIPPED_SYNC \t | \t {} \t|".format(SKIPPED_SYNC))
    pc.printMsg("|\tFAILED_SYNC \t | \t {} \t|".format(FAILED_SYNC))
    pc.printMsg("\n----------------------------------------------------------------------------------\n")

def ContentFormatting(ts):
    """
    Updates the Content & WeightedContent columns for each row:
        1. get url_strings_content = getUrlString(row[13]) -> add it to weighted_content
        2. clean_text(row[13])
        3. clean_text(row[12])
        4. clean_text(row[5]) -> WeightedContent becomes
           clean_weighted_content + " " + url_strings_content + " " + clean_title
        5. if the Content column is still null, put the title into it (and into
           WeightedContent too)
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < Content Formatter: DB/wc Connection Opened > ---------------------------------------------\n")
    startTime = time.time()
    pc.printWarn("\tRunning ContentFormatter for wc ....... \t NOW: {}".format(time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn("\t\t. . . . . . . . . . . .......... Content Formatting Started @Content_Scraper ........... . . . . . . . . . . .")

    signal.signal(signal.SIGALRM, timeout_handler)  # timeouts on a few function calls below

    q = "select * from " + wc_table
    rows = c.execute(q).fetchall()
    conn.commit()

    for row in rows:
        t1 = time.time()
        row_list = list(row)
        if len(row[13]) != 0:
            gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK += 1
            clean_title = clean_text(row_list[5])
            raw_content = row_list[13]

            # Each call below gets a 200-sec budget via SIGALRM; if the alarm
            # fires, the handler raises and the pre-set fallback value is kept.
            signal.alarm(200)
            content = clean_title
            try:
                content = text_actions.contentfromhtml(raw_content)
            except Exception:
                pc.printErr("\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@ContentFromHtml ! ....using Title as content ".format(row[0], row[1]))

            signal.alarm(200)
            clean_content = clean_title
            try:
                clean_content = clean_text(content)
            except Exception:
                pc.printErr("\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@CleanText ! ....using Title as content ".format(row[0], row[1]))

            signal.alarm(200)
            weighted_content = clean_title
            try:
                weighted_content = text_actions.weightedcontentfromhtml(raw_content)
            except Exception:
                pc.printErr("\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@WeightedContentFromHtml ! ....using Title as weightedcontent ".format(row[0], row[1]))

            signal.alarm(200)
            clean_weighted_content = clean_title
            try:
                clean_weighted_content = clean_text(weighted_content)
            except Exception:
                pc.printErr("\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@CleanText ! ....using Title as weightedcontent ".format(row[0], row[1]))

            signal.alarm(200)
            url_string_text = ''
            try:
                url_string_text = getUrlString(raw_content)
            except Exception:
                pc.printErr("\t <ID = {}><src= {} > Timeout of 200 sec happened on URL_STRING@getUrlString ! ....using empty str as url_string_text ".format(row[0], row[1]))

            signal.alarm(0)  # cancel the pending alarm now that all timed calls are done

            row_list[13] = clean_content
            row_list[12] = clean_weighted_content + " " + url_string_text + " " + clean_title
            row = tuple(row_list)
            pc.printWarn("\t <ID = {}><src= {} > [Content Formatting] Done................ \t\t TimeTaken = {} \t NOW: {}".format(
                row[0], row[1], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime())))

            q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ? where ID = ? and SourceSite = ?'
            d = (row[13], row[12], row[0], row[1])
            c.execute(q, d)
            conn.commit()
        else:  # no content: fall back to the title
            gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT += 1
            pc.printMsg("\t <ID = {}><src= {} > [Content Formatting] No content. Using title finally................ \t\t TimeTaken = {} \t NOW: {}".format(
                row[0], row[1], round((time.time() - t1), 5), time.strftime("%H:%M:%S", time.localtime())))
            clean_title = clean_text(row_list[5])
            q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ? where ID = ? and SourceSite = ?'
            d = (clean_title, clean_title, row[0], row[1])
            c.execute(q, d)
            conn.commit()

    endTime = time.time()
    conn.close()
    pc.printMsg("\t -------------------------------------- < Content Formatter: DB/wc Connection Closed > ---------------------------------------------\n")
    pc.printSucc("\n\n***************************** Content Formatting is Complete. TABLE: {} ******************".format(wc_table))
    print("\n\n")
    table = PrettyTable(['Success (Post Content Formatting)', 'Notation(if any)', 'Value'])
    table.add_row(['IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK])
    table.add_row(['OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)', gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT])
    table.add_row(['TIME TAKEN - CONTENT FORMATTING (min)', '-', round((endTime - startTime) / 60, 5)])
    pc.printSucc(table)
    print("\n")
    pc.printWarn('\t\t\t------------------------->>>>>> [ TimeTaken for Content Formatting (min) = {} ]\n'.format(
        round((endTime - startTime) / 60, 5)))
    print("\n\n")
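
# ContentFormatting above relies on a module-level timeout_handler for its
# signal.alarm(200) calls. A minimal sketch of that handler, assuming the
# standard SIGALRM pattern (the exception name is hypothetical): the handler
# must raise, so the interrupted call lands in the surrounding `except` block
# and the fallback value is kept. Note SIGALRM is Unix-only.
class FunctionTimeoutError(Exception):
    pass


def timeout_handler(signum, frame):
    raise FunctionTimeoutError("call exceeded the signal.alarm() budget")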

async def fetchWithRetry(row, session, csv_out):
    """
    Hits url (with retries):
        * if status == 200: put the scraped content into the csv
        * if still unable to hit after retries: Content = Title, WeightedContent = Title
    """
    status = 400
    retry_cnt = 2
    sleep_time = 10
    TIMEOUT = 10

    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if status == 200 and len(res) != 0:
                pc.printSucc("\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}".format(
                    row["ID"], row["SourceSite"], time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.weightedcontentfromhtml(res) + row["Title"] + urlstrings
                row["Content"] = text_actions.contentfromhtml(res) + urlstrings
                if len(row["Title"]) != 0:
                    if len(row["Content"]) == 0:
                        row["WeightedContent"] = row["Title"]
                        row["Content"] = row["Title"]
                    await write_result(csv_out, row)
                    global WRITTEN_ENTRIES_ASYNC_SCRAPED
                    WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                    pc.printMsg(" \t\t ============== [Scraped] Done Writing into csv for <ID = {}><src= {} > =============== ".format(
                        row["ID"], row["SourceSite"]))
                else:
                    global WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING
                    WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr("\t\t xxxxxxxxxxxxxxxxxxx SKIPPING for <ID = {}><src= {} > As No Title xxxxxxxxxxxxxxxxxxxxxxxx\n".format(
                        row["ID"], row["SourceSite"]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn("\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x".format(
                    row["ID"], row["SourceSite"], status, row["Url"][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr("\t\txxxxx For <ID = {}><src= {} > Totally unable to hit url.... using Title for Content & WeightedContent : {} ".format(
        row["ID"], row["SourceSite"], row["Url"]))
    if len(row["Content"]) == 0:
        row["WeightedContent"] = row["Title"]
        row["Content"] = row["Title"]
    await write_result(csv_out, row)
    global WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR
    WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(" \t\t\t ============== [Unreachable URL] Done Writing into csv for <ID = {}><src= {} > =============== ".format(
        row["ID"], row["SourceSite"]))
    return row
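
# A hedged sketch of the write_result coroutine used above, inferred from its
# call site (the real helper is defined elsewhere in this module; the name
# below is suffixed to mark it as a sketch). A single asyncio.Lock keeps
# concurrent coroutines from interleaving csv rows; the write itself is
# blocking, which is tolerable here because rows are small.
CSV_WRITE_LOCK = asyncio.Lock()  # assumes Python 3.10+, where Lock() no longer binds a loop at creation


async def write_result_sketch(csv_out, row):
    fields = ['ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
              'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
              'PopI', 'WeightedContent', 'Content']
    async with CSV_WRITE_LOCK:
        csv_out.writerow([row[k] for k in fields])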

async def fetchWithRetry(row, session):
    """
    Hits url (with retries):
        * if status == 200: return `row` with (raw) Content & (raw) WeightedContent filled in
        * if still unable to hit after retries: return `row` unchanged (the caller
          falls back to Title for Content & WeightedContent)
    INPUT: `row` is a tuple with indices:
        ID(0), SourceSite(1), ProcessingDate(2), ProcessingEpoch(3), CreationDate(4),
        Title(5), Url(6), SourceTags(7), ModelTags(8), NumUpvotes(9), NumComments(10),
        PopI(11), WeightedContent(12), Content(13)
    """
    status = 400
    retry_cnt = 2
    sleep_time = 5
    TIMEOUT = 20

    while retry_cnt > 0 and status != 200:
        async with session.get(row[6],
                               ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            res = text_actions.clean_text(str(res))
            status = response.status
            if status == 200 and len(res) != 0:
                pc.printSucc("\t\t <ID = {}><src= {} > ============== #Scraped ....... \t NOW: {}".format(
                    row[0], row[1], time.strftime("%H:%M:%S", time.localtime())))
                row_list = list(row)
                row_list[12] = text_actions.weightedcontentfromhtml(res)
                row_list[13] = text_actions.contentfromhtml(res)
                row = tuple(row_list)
                if len(row[13]) == 0:
                    global ERR_ASYNC_NO_CONTENT_IN_SCRAPING
                    ERR_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr("\t\t xxxxxxxxxxxxxxxxxxx SKIPPING for <ID = {}><src= {} > As No Content even after scraping xxxxxxxxxxxxxxxxxxxxxxxx\n".format(
                        row[0], row[1]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn("\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x".format(
                    row[0], row[1], status, row[6][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr("\t\txxxxx For <ID = {}><src= {} > Totally unable to hit url.... using Title for Content & WeightedContent : {} ".format(
        row[0], row[1], row[6]))
    global ERR_ASYNC_ON_URL_ERROR
    ERR_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(" \t\t\t ============== [Unreachable URL] Will write anyways. <ID = {}><src= {} > =============== ".format(
        row[0], row[1]))
    return row
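
# A minimal sketch of writing the tuple rows returned above back into the
# wc_<ts> table, following the same update-by-(ID, SourceSite) pattern that
# ContentFormatting uses. The function name is hypothetical; the repo's
# actual write-back lives with the caller.
def writeRowsBack(conn, wc_table, rows):
    c = conn.cursor()
    q = 'update ' + wc_table + ' set WeightedContent = ?, Content = ? where ID = ? and SourceSite = ?'
    for row in rows:
        c.execute(q, (row[12], row[13], row[0], row[1]))
    conn.commit()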