Example No. 1
async def fetchWithRetry(row, session):
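    """
        Hit row["Url"] (with retries):
        * if status == 200 and the body is non-empty: fill row["WeightedContent"] and
          row["Content"] from the scraped HTML and return the row
        * if still unreachable after retries: count it as skipped and return the row unchanged
    """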
    status = 400
    retry_cnt = 3
    sleep_time = 10
    TIMEOUT = 60

    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                    .format(row["ID"], row["SourceSite"],
                            time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.clean_text(
                    text_actions.weightedcontentfromhtml(res) + row["Title"] +
                    urlstrings)
                row["Content"] = text_actions.clean_text(
                    text_actions.contentfromhtml(res) + urlstrings)
                if (len(row["Content"]) == 0):
                    row["WeightedContent"] = text_actions.clean_text(
                        row["Title"])
                    row["Content"] = text_actions.clean_text(row["Title"])
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- Unable to hit URL(ERR_CODE={}): {}  Sleeping for {} Retries remaining = {} -------------x"
                    .format(status, row["Url"], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    pc.printErr(
        "\t\txxxxx SKIPPING... for <ID = {}><src= {} > Unable to hit url: {} , "
        .format(row["ID"], row["SourceSite"], row["Url"]))
    global SKIPPED_ASYNC
    SKIPPED_ASYNC += 1
    return row
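A minimal driver sketch for the coroutine above (not part of the original example), assuming the rows come from a csv.DictReader and that aiohttp is available; the input file path and the `main` name are illustrative only.

import asyncio
import csv

import aiohttp


async def main(rows):
    # One shared session; each row is fetched concurrently and retries on its own.
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetchWithRetry(row, session) for row in rows))


if __name__ == "__main__":
    with open('dbs/wc-db/wc_table_1598692058.csv', mode='r') as csvfile:  # hypothetical input file
        rows = list(csv.DictReader(csvfile))
    asyncio.run(main(rows))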
Example No. 2
def RunSync(ts):
    """
        Pick wc-db's table mapped with `ts` and scrapes (useful) "clean" Content & WeightedContent from url.
        * NOTE:
            * If conent is already present in the table, "clean" it too & append the newly scraped content to it.
            * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
        Input: ts (format: 1598692058.887741)
    """
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(SYNC) ................... => FILENAME: {}\n'
        .format(datetime.fromtimestamp(ts),
                'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_sync.csv'))

    csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(
        int(ts)) + '.csv'
    csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(
        int(ts)) + '_wc_sync.csv'
    index = 1
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_dest_file, headers)

    f = csv.writer(open(csv_dest_file, "w"))  # Flush the old file
    f.writerow([
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ])
    with open(csv_src_file, mode='r') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Headers are {", ".join(row)}')
                line_count += 1
            #CHECK1(pre scraping): if (content != NULL) => no scraping, just put it in as is
            if (len(row["Content"]) != 0):
                pc.printWarn(
                    "\t <ID = {} > [NO SCRAPING] Content already exists....putting as it is............. NOW: {}"
                    .format(row["ID"],
                            time.strftime("%H:%M:%S", time.localtime())))
                entry = [
                    row["ID"],
                    row["SourceSite"],
                    row["ProcessingDate"],
                    row["ProcessingEpoch"],
                    row["CreationDate"],
                    row["Title"],
                    row["Url"],
                    row["SourceTags"],
                    row["ModelTags"],
                    row["NumUpvotes"],
                    row["NumComments"],
                    row["PopI"],
                    text_actions.clean_text(row["Title"] +
                                            row["WeightedContent"]) +
                    text_actions.getUrlString(
                        row["Content"]),  #add the url-words too
                    text_actions.clean_text(row["Content"]) +
                    text_actions.getUrlString(row["Content"])
                ]
                global WRITTEN_ENTRIES_SYNC
                WRITTEN_ENTRIES_SYNC += 1
                f = csv.writer(open(csv_dest_file, "a"))
                f.writerow(entry)
            #CHECK2(pre scraping): if(url == NULL)=>discard
            #CHECK3(pre scraping): if (row["title"]==NULL)=>discard
            elif ((len(row["Url"]) != 0) and (len(row["Title"]) != 0)):
                pc.printWarn(
                    "\t <ID = {} > [SCRAPING BEGIN] sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}"
                    .format(row["ID"],
                            time.strftime("%H:%M:%S", time.localtime())))
                time.sleep(0.0001)
                try:
                    # response = web_requests.hitGetWithRetry(url,TIMEOUT=10)
                    response = web_requests.hitGetWithRetry(
                        row["Url"], '', False, 2, 0.5, 60)
                    # if response.status_code == 200:
                    if response != -1:
                        # content = text_actions.contentfromhtml(response)  #NOTE: for sync
                        content = text_actions.contentfromhtml(
                            response.text)  #NOTE: for Async
                        urlstrings = text_actions.getUrlString(content)
                        content += urlstrings  #add the url-words too
                        # weightedcontent = text_actions.weightedcontentfromhtml(response.text) + row["Title"] + urlstrings #add the url-words too      #NOTE: for sync
                        weightedcontent = text_actions.weightedcontentfromhtml(
                            response.text
                        ) + row[
                            "Title"] + urlstrings  #add the url-words too        #NOTE: for async
                        line_count += 1
                        #CHECK1(post scraping): if (content == null) && (row["Title"] != null) <already checked above> => row["Content"] = clean_text(row["Title"]) AND row["WeightedContent"] = clean_text(row["Title"])
                        if (len(content) == 0):
                            content = row["Title"]
                            weightedcontent = row["Title"]
                        # Build the entry unconditionally so empty-content rows are also written
                        entry = [
                            row["ID"], row["SourceSite"],
                            row["ProcessingDate"], row["ProcessingEpoch"],
                            row["CreationDate"], row["Title"], row["Url"],
                            row["SourceTags"], row["ModelTags"],
                            row["NumUpvotes"], row["NumComments"],
                            row["PopI"],
                            text_actions.clean_text(weightedcontent),
                            text_actions.clean_text(content)
                        ]

                        f = csv.writer(open(csv_dest_file, "a"))
                        f.writerow(entry)
                        pc.printMsg(
                            "\t\t <ID = {} > ============== Scraping Done....... \t NOW: {}"
                            .format(
                                row["ID"],
                                time.strftime("%H:%M:%S", time.localtime())))
                    else:
                        global SKIPPED_SYNC
                        SKIPPED_SYNC += 1
                        pc.printErr(
                            "\t\txxxxx SKIPPING... for ID: {} Unable to hit url: {} , "
                            .format(row["ID"], row["Url"]))
                except Exception as e:
                    global FAILED_SYNC
                    FAILED_SYNC += 1
                    pc.printErr(
                        "\t======= XXXXXXXX ERROR XXXXXX ======>> ID= {} NOW = {} Skipping...Failed due to: \n \t\t ERROR {}"
                        .format(row["ID"],
                                time.strftime("%H:%M:%S", time.localtime()),
                                e))
                    pass
    pc.printMsg(
        "\n****************** Content Scraping is Complete , FILENAME: {} ********************\n"
        .format('dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_sync.csv'))
    pc.printMsg(
        "\n----------------------------------------------------------------------------------\n"
    )
    pc.printMsg(
        "|\tWRITTEN_ENTRIES_SYNC \t  | \t {} \t|".format(WRITTEN_ENTRIES_SYNC))
    pc.printMsg("|\tSKIPPED_SYNC          \t | \t {} \t|".format(SKIPPED_SYNC))
    pc.printMsg("|\tFAILED_SYNC           \t | \t {} \t|".format(FAILED_SYNC))
    pc.printMsg(
        "\n----------------------------------------------------------------------------------\n"
    )
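`web_requests.hitGetWithRetry` is not shown in this example; below is a minimal sketch of a comparable helper built on `requests`, returning -1 on failure as the caller above expects. The parameter names and the linear backoff are assumptions.

import time

import requests


def hit_get_with_retry(url, retries=2, backoff=0.5, timeout=60):
    """Return the Response on HTTP 200, or -1 after exhausting retries."""
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass
        time.sleep(backoff * (attempt + 1))  # simple linear backoff between attempts
    return -1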
Example No. 3
def ContentFormatting(ts):
    """ 
    Do:
        0. Update Content & WeightedContent column for each row
        1. get url_strings_content = getUrlString(row[13]) -> add it in weighted_content
        2. do clean_text(row[13])
        2. do clean_text(row[12])
        3. clean text clean_text(row[5]) -> add it in weighted_content :: clean_text(row[12]) + " " + clean_title + " " + url_strings_content
        4. if content col is still null; put title into it & in weightedContent too
    """

    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    conn = sqlite3.connect(wc_db)
    c = conn.cursor()
    pc.printMsg(
        "\t -------------------------------------- < Content Formatter: DB/wc Connection Opened > ---------------------------------------------\n"
    )
    startTime = time.time()
    pc.printWarn("\tRunning ContentFormatter for wc ....... \t NOW: {}".format(
        time.strftime("%H:%M:%S", time.localtime())))
    pc.printWarn(
        "\t\t. .  .  .  .  .  .  .  .  .  .  .......... Content Formatting Started @Content_Scraper ...........  .  .  .  .  .  .  .  .  .  .  ."
    )

    signal.signal(signal.SIGALRM,
                  timeout_handler)  # used to time out a few function calls, see below
    q = "select * from " + wc_table
    rows_head = c.execute(q)
    rows = rows_head.fetchall()
    conn.commit()
    for row in rows:
        t1 = time.time()
        row_list = list(row)
        if (len(row[13]) != 0):
            gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK += 1
            clean_title = clean_text(row_list[5])
            if len(row_list[13]) == 0:
                pc.printWarn(
                    "\t\t\t\t --------- No content found on cleaning, using Title as Content :("
                )
                row_list[13] = clean_title
                row_list[12] = clean_title
            else:
                raw_content = row_list[13]
                signal.alarm(200)  # Timeout of 200 sec on function call
                content = clean_title  # if timeout happens, this will be the value of content
                try:
                    content = text_actions.contentfromhtml(raw_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@ContentFromHtml ! ....using Title as content "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                clean_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    clean_content = clean_text(content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on CONTENT@CleanText ! ....using Title as content "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                weighted_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    weighted_content = text_actions.weightedcontentfromhtml(
                        raw_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@WeightedContentFromHtml ! ....using Title as weightedcontent "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                clean_weighted_content = clean_title  # if timeout happens, this will be the value of content
                try:
                    clean_weighted_content = clean_text(weighted_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on WEIGHTED_CONTENT@CleanText ! ....using Title as weightedcontent "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                signal.alarm(200)  # Timeout of 200 sec on function call
                url_string_text = ''  # if timeout happens, this will be the value of content
                try:
                    url_string_text = getUrlString(raw_content)
                except Exception as exc:
                    pc.printErr(
                        "\t <ID = {}><src= {} > Timeout of 200 sec happened on URL_STRING@getUrlString ! ....using empty str as url_string_text "
                        .format(row[0], row[1]))
                    # pc.printWarn(exc)
                    pass

                row_list[13] = clean_content
                row_list[
                    12] = clean_weighted_content + " " + url_string_text + " " + clean_title

            row = tuple(row_list)

            pc.printWarn(
                "\t <ID = {}><src= {} > [Content Formatting] Done................ \t\t TimeTaken = {} \t NOW: {}"
                .format(row[0], row[1], round((time.time() - t1), 5),
                        time.strftime("%H:%M:%S", time.localtime())))
            content = row[13]
            q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ?  where ID = ? and SourceSite = ?'
            d = (row[13], row[12], row[0], row[1])
            c.execute(q, d)
            conn.commit()
            # pc.printSucc(" \t\t ============== <ID= {} ><{}> [Content Formatting]-with content INSERTED INTO TABLE =============== ".format(row[0],row[1]))
        else:  #No content
            gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT += 1
            pc.printMsg(
                "\t <ID = {}><src= {} > [Content Formatting] No content.Using title finally................ \t\t TimeTaken = {} \t NOW: {}"
                .format(row[0], row[1], round((time.time() - t1), 5),
                        time.strftime("%H:%M:%S", time.localtime())))
            clean_title = clean_text(row_list[5])
            content = clean_title
            q = 'update ' + wc_table + ' set Content = ?, WeightedContent = ?  where ID = ? and SourceSite = ?'
            d = (content, content, row[0], row[1])
            c.execute(q, d)
            conn.commit()
            # pc.printSucc(" \t\t ============== <ID= {} ><{}> [Content Formatting]-without content INSERTED INTO TABLE =============== ".format(row[0],row[1]))
    endTime = time.time()

    conn.close()
    pc.printMsg(
        "\t -------------------------------------- < Content Formatter: DB/wc Connection Closed > ---------------------------------------------\n"
    )

    pc.printSucc(
        "\n\n***************************** Content Formatting is Complete. TABLE: {} ******************"
        .format(wc_table))
    print("\n\n")
    table = PrettyTable(
        ['Success (Post Content Formatting)', 'Notation(if any)', 'Value'])
    table.add_row([
        'IN : gw.WC_TOTAL_URL_ENTRIES ', '[X] (A+B+C=X)',
        gw.WC_TOTAL_URL_ENTRIES
    ])
    table.add_row([
        'OUT : ITEMS PUT IN WITH SCRAPED CONTENT', '[P] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_OK
    ])
    table.add_row([
        'OUT : x--ITEMS PUT IN WITH TITLE AS CONTENT--x', '[Q] (P+Q=X)',
        gw.CS_ITEM_PUT_IN_AFTER_CONTENT_FORMATTING_NO_CONTENT
    ])
    table.add_row([
        'TIME TAKEN - CONTENT FORMATTING (min)', '-',
        round((endTime - startTime) / 60, 5)
    ])
    pc.printSucc(table)

    print("\n")
    pc.printWarn(
        '\t\t\t------------------------->>>>>> [ TimeTaken for Content Formatting (min) = {} ]\n'
        .format(round((endTime - startTime) / 60, 5)))
    print("\n\n")
async def fetchWithRetry(row, session, csv_out):
    """
        Hits url (with retries):
        * if status == 200:
            put content into csv
        * if still unable to hit after retries: Content = Title , WeightedContent = Title
    """

    status = 400
    retry_cnt = 2
    sleep_time = 10
    TIMEOUT = 10

    while retry_cnt > 0 and status != 200:
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                    .format(row["ID"], row["SourceSite"],
                            time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.weightedcontentfromhtml(
                    res) + row["Title"] + urlstrings
                row["Content"] = text_actions.contentfromhtml(res) + urlstrings
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row["ID"],row["SourceSite"],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                if (len(row["Title"]) != 0):
                    if len(row["Content"]) == 0:
                        row["WeightedContent"] = row["Title"]
                        row["Content"] = row["Title"]
                    await write_result(csv_out, row)
                    global WRITTEN_ENTRIES_ASYNC_SCRAPED
                    WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                    pc.printMsg(
                        " \t\t ============== [Scraped] Done Writing into csv for <ID = {}><src= {} > =============== "
                        .format(row["ID"], row["SourceSite"]))
                else:
                    global WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING
                    WRITTEN_ENTRIES_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr(
                        "\t\t xxxxxxxxxxxxxxxxxxx SKIPPING  for <ID = {}><src= {} > As No Title xxxxxxxxxxxxxxxxxxxxxxxx\n"
                        .format(row["ID"], row["SourceSite"]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}.........  Sleeping for {} Retries remaining = {} -------------x"
                    .format(row["ID"], row["SourceSite"], status,
                            row["Url"][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)
    pc.printErr(
        "\t\txxxxx  For <ID = {}><src= {} >Totally unable to hit url.... using Title for Content & WeightedContent : {} "
        .format(row["ID"], row["SourceSite"], row["Url"]))
    if len(row["Content"]) == 0:
        row["WeightedContent"] = row["Title"]
        row["Content"] = row["Title"]
    await write_result(csv_out, row)
    global WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR
    WRITTEN_ENTRIES_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(
        " \t\t\t ============== [Unreachable URL] Done Writing into csv for <ID = {}><src= {} > =============== "
        .format(row["ID"], row["SourceSite"]))
    return row
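`write_result` is awaited above but not shown; here is a minimal sketch, assuming it appends one row to the output CSV using the same column order as the other examples. The field order and the synchronous write inside the coroutine are assumptions.

import csv


async def write_result(csv_out, row):
    # Append one scraped row; a production version might offload this to a thread.
    fields = ['ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
              'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
              'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content']
    with open(csv_out, mode='a', newline='') as f:
        csv.writer(f).writerow([row[k] for k in fields])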
async def fetchWithRetry(row, session):
    """
        Hits url (with retries):
        * if status == 200:
            return response ((raw) Content & (raw) WeightedContent in row)
        * if still unable to hit after retries: Content = Title , WeightedContent = Title
        INPUT: `row` is an array with indices: 
            ID(0),SourceSite(1),ProcessingDate(2),ProcessingEpoch(3),CreationDate(4),Title(5),Url(6),
            SourceTags(7),ModelTags(8),NumUpvotes(9),NumComments(10),PopI(11),WeightedContent(12),Content(13)
    """

    status = 400
    retry_cnt = 2
    sleep_time = 5
    # TIMEOUT = ClientTimeout(total=20)
    TIMEOUT = 20

    while retry_cnt > 0 and status != 200:
        async with session.get(row[6],
                               ssl=ssl.create_default_context(
                                   purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            # res = await response.content.read()
            # res = await text_actions.clean_text(str(response.content.read()))
            res = text_actions.clean_text(str(res))
            # res = res.encode('utf8', 'ignore').decode('utf8', 'ignore')                   #FIXME: not working
            status = response.status
            if (status == 200 and len(res) != 0):
                pc.printSucc(
                    "\t\t <ID = {}><src= {} > ============== #Scraped ....... \t NOW: {}"
                    .format(row[0], row[1],
                            time.strftime("%H:%M:%S", time.localtime())))
                row_list = list(row)
                row_list[12] = text_actions.weightedcontentfromhtml(res)
                row_list[13] = text_actions.contentfromhtml(res)
                # for i in range(len(row_list)):
                #     row_list[i] = row_list[i].decode("utf-8", "ignore")

                row = tuple(row_list)
                # pc.printWarn("\t <ID = {}><src= {} > sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}".format(row[0],row[1],time.strftime("%H:%M:%S", time.localtime())))
                # time.sleep(0.001)
                if (len(row[13]) == 0):
                    global ERR_ASYNC_NO_CONTENT_IN_SCRAPING
                    ERR_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr(
                        "\t\t xxxxxxxxxxxxxxxxxxx SKIPPING  for <ID = {}><src= {} > As No Content even after scraping xxxxxxxxxxxxxxxxxxxxxxxx\n"
                        .format(row[0], row[1]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn(
                    "\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}.........  Sleeping for {} Retries remaining = {} -------------x"
                    .format(row[0], row[1], status, row[6][:25], sleep_time,
                            retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr(
        "\t\txxxxx  For <ID = {}><src= {} >Totally unable to hit url.... using Title for Content & WeightedContent : {} "
        .format(row[0], row[1], row[6]))
    global ERR_ASYNC_ON_URL_ERROR
    ERR_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(
        " \t\t\t ============== [Unreachable URL] Will write anyways. <ID = {}><src= {} > =============== "
        .format(row[0], row[1]))
    return row
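A minimal driver sketch for the tuple-row variant above (not part of the original example), assuming the rows come straight from the sqlite table described in the docstring; the db path and table name follow the ContentFormatting example but remain assumptions.

import asyncio
import sqlite3

import aiohttp


async def scrape_table(ts):
    conn = sqlite3.connect('dbs/wc.db')  # path taken from the ContentFormatting example
    rows = conn.execute('select * from wc_' + str(int(ts))).fetchall()
    conn.close()
    async with aiohttp.ClientSession() as session:
        # Each tuple row is scraped concurrently with its own retry loop.
        return await asyncio.gather(*(fetchWithRetry(row, session) for row in rows))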