Example #1
async def parse(url):

    driver = get_driver()
    page_source = await get_page_source(driver, url)
    driver.close()
    parsed_dict = {url: parse_html(page_source)}

    return parsed_dict
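A minimal driver sketch for this coroutine, assuming the helpers get_driver, get_page_source and parse_html from the example are importable in the same module; the URL list is purely illustrative.

import asyncio

async def main():
    urls = ["https://example.com", "https://example.org"]
    # Run the example's parse() coroutine for several URLs.
    results = await asyncio.gather(*(parse(url) for url in urls))
    # Each result is a {url: parsed_html} dict; merge them for convenience.
    merged = {url: parsed for result in results for url, parsed in result.items()}
    print(list(merged))

if __name__ == "__main__":
    asyncio.run(main())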
Example #2
File: update.py Project: elsid/mosecom-air
def get_source_data(logger):
    source = HtmlSource(logger)
    for station_name in source.get_stations_list():
        try:
            html = source.get_station_html(station_name)
            yield station_name, parse_html(html)
            logger.info('action=[get_source_data] result=[success] '
                        'station=[%s]', station_name)
        except RequestError as error:
            logger.error('action=[get_source_data] result=[fail] station=[%s] '
                         '%s', station_name, make_one_line(error))
        except Exception as error:
            logger.error('action=[get_source_data] result=[error] station=[%s] '
                         'reason=[%s]', station_name, make_one_line(error))
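Since get_source_data is a generator yielding (station_name, parsed) pairs, a caller can materialize it directly into a dict. A small consumption sketch, assuming only the standard logging module; HtmlSource, RequestError and parse_html still come from the project above.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("mosecom_air.update")
# {station_name: parsed_html} for every station that was fetched successfully
source_data = dict(get_source_data(logger))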
Example #4
def parse_uri(content):
    from datetime import datetime
    start = datetime.now()
    # javascript: scheme
    if content.lower().find('javascript:') == 0:
        parser.process_content(content[len('javascript:'):], contexts.URI_JS)
        content = urllib.unquote(content).decode("utf8")
        js_parser.parse_js(content)

    # data: scheme
    elif content.lower().find('data:') == 0:
        content = content[len('data:'):]

        # placeholder is in content type
        if content.find(",") == -1:
            # invalid format of data: scheme
            parser.process_content(content, contexts.URI_UNKNOWN_DATA)
            return
        parser.process_content(content[0:content.find(",")],
                               contexts.URI_CONTENT_TYPE)

        # extracts content-type, encoding and charset
        # if encoding not found, uses urlencode
        # if encoding urlencode and charset not found, uses utf8
        enctype = "urlencode"
        if content.find(";") != -1 and content.find(",") > content.find(";"):
            content_type = content[0:content.find(";")]
            encoding = content[content.find(";") + 1:content.find(",")]
            # placeholder is in encoding
            parser.process_content(encoding, contexts.URI_UNKNOWN_DATA)
            if encoding.lower() == "base64":
                enctype = "base64"
            elif encoding.lower().find("charset=") == 0:
                # take the value that follows "charset="
                charset = encoding[len("charset="):]
            else:
                charset = "utf8"
        else:
            content_type = content[0:content.find(",")]
            charset = "utf8"

        # decode content
        content = content[content.find(",")]
        if enctype == "base64":
            content = b64decode(content)
        else:
            content = urllib.unquote(content).decode(charset)

        # subprocess content according to the content type
        if content_type.lower() == "text/html":
            parser.process_content(content, contexts.URI_HTML_DATA)
            html_parser.parse_html(content)
        elif content_type.lower() == "text/css":
            parser.process_content(content, contexts.URI_CSS_DATA)
            css_parser.parse_css_stylesheet(content)
        elif content_type.lower() == "text/javascript" or content_type.lower(
        ) == "application/x-javascript" or content_type.lower(
        ) == "application/javascript":
            parser.process_content(content, contexts.URI_JS_DATA)
            js_parser.parse_js(content)
        else:
            parser.process_content(content, contexts.URI_OTHER_DATA)

    # other schemes
    else:
        parser.process_content(content, contexts.URI_URL)
    end = datetime.now()
    library.uri_us += end - start
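The data: branch above splits the URI by hand; as an independent cross-check, here is a self-contained Python 3 sketch of the same idea using only the standard library. The helper name split_data_uri and the default content type are assumptions, not part of the example's project.

from base64 import b64decode
from urllib.parse import unquote

def split_data_uri(uri):
    # "data:<mediatype>[;base64],<payload>" -- everything before the first
    # comma is metadata, everything after it is the encoded payload.
    assert uri.lower().startswith("data:")
    meta, _, payload = uri[len("data:"):].partition(",")
    parts = meta.split(";") if meta else []
    is_base64 = "base64" in (p.lower() for p in parts)
    # The first parameter is the media type unless it is a key=value pair.
    content_type = parts[0] if parts and "=" not in parts[0] else "text/plain"
    data = b64decode(payload) if is_base64 else unquote(payload)
    return content_type, data

print(split_data_uri("data:text/html;base64,PGgxPmhpPC9oMT4="))
# ('text/html', b'<h1>hi</h1>')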
Example #5
 def __call__(self):
     response = requests.get(DOWNLOAD_URL)
     parse_html()
     self.finished.emit(response.text)
Example #6
def download_data():
    print('task thread is:', threading.current_thread())
    print('downloading')
    response = requests.get(DOWNLOAD_URL)
    parse_html()
    return response.text
Example #7
 def _button_clicked(self):
     response = requests.get(DOWNLOAD_URL)
     parse_html()
     self._textarea.setPlainText(response.text)
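The three fragments above all issue the same blocking requests.get from UI code; a hedged sketch of pushing that download onto a worker thread with the standard library follows. DOWNLOAD_URL is a placeholder here, and the parse_html() call is left out because its definition is not shown in the fragments.

import threading

import requests

DOWNLOAD_URL = "https://example.com/data"  # hypothetical placeholder

def download_data():
    response = requests.get(DOWNLOAD_URL)
    return response.text

def download_in_background(on_done):
    # Run the blocking download off the caller's thread and hand the text
    # back through a callback, mirroring the finished.emit() pattern above.
    worker = threading.Thread(target=lambda: on_done(download_data()), daemon=True)
    worker.start()
    return worker

# download_in_background(print)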
Example #8
def crawl(crawler_database, start_url):

    crawler_database.connect_db()

    if crawler_database.check_db():  # Restarting the previous crawling process
        print("Restarting previous crawl process.")

    else:  # Starting a new crawling process from start_url

        website = wm.extract_root(start_url)

        if len(website) > 1:
            crawler_database.cursor.execute(
                'INSERT OR IGNORE INTO Websites '
                '(url) VALUES ( ? )', (website, ))

            crawler_database.cursor.execute(
                'INSERT OR IGNORE INTO Pages '
                '(url, html, new_rank) '
                'VALUES ( ?, NULL, 1.0 )', (start_url, ))

            crawler_database.connection.commit()

    # Getting the list of websites the crawl is restricted to
    websites = []
    crawler_database.cursor.execute('''SELECT url FROM Websites''')

    for row in crawler_database.cursor:
        websites.append(str(row[0]))

    print(websites)

    crawl_counter = 0

    while True:  # Crawl Loop

        if crawl_counter < 1:
            crawl_counter = int(input('How many pages do you want to crawl? '))
            if crawl_counter == 0:
                break

        print("Crawling Iteration:", crawl_counter)
        crawl_counter -= 1

        # Getting a random row where HTML Code and error is NULL
        crawler_database.cursor.execute('SELECT id,url FROM Pages WHERE '
                                        'html is NULL and error is NULL '
                                        'ORDER BY RANDOM() LIMIT 1')

        row = crawler_database.cursor.fetchone()

        if row is None:  # Breaking the loop when there are no rows left
            print("All pages are retrieved.")
            break

        from_id = row[0]
        url = row[1]

        print("Crawling ->", "Page ID =", from_id, "| URL =", url, end=' ')

        # This query deletes all records which the current page points to.
        crawler_database.cursor.execute('DELETE from Links WHERE from_id=?',
                                        (from_id, ))

        try:
            document = wm.open_url(url)

            # Checking the HTTP status code (200 means success)
            if wm.get_http_status_code(document) != 200:
                print("Error occurred while loading the page: ",
                      wm.get_http_status_code(document))

                crawler_database.cursor.execute(
                    'UPDATE Pages SET error=? WHERE url=?',
                    (wm.get_http_status_code(document), url))

            # Checking the Content-Type
            if wm.get_http_content_type(document) != 'text/html':
                print("This URL is not an HTML file.")
                crawler_database.cursor.execute(
                    'DELETE FROM Pages WHERE url=?', (url, ))
                crawler_database.cursor.execute(
                    'UPDATE Pages SET error=0 WHERE url=?', (url, ))
                crawler_database.connection.commit()
                continue

            # Using BeautifulSoup to repair and parse the HTML document
            soup = hp.parse_html(document)
            html_code = str(soup)

        except HTTPError:
            print("Unable to retrieve or parse page")
            crawler_database.cursor.execute(
                'UPDATE Pages SET error=-1 WHERE url=?', (url, ))
            crawler_database.connection.commit()
            continue

        # Inserting the currently crawling page to Pages table with initial rank 1.0
        crawler_database.cursor.execute(
            'INSERT OR IGNORE INTO Pages (url, html, new_rank)'
            ' VALUES ( ?, NULL, 1.0 )', (url, ))

        # Updating the currently crawling pages html field
        crawler_database.cursor.execute('UPDATE Pages SET html=? WHERE url=?',
                                        (html_code, url))
        crawler_database.connection.commit()

        # Getting all <a> tags because they are the links to other pages
        a_tags = hp.get_tags(soup, "a")
        href_count = 0

        for tag in a_tags:
            href_attribute = hp.get_attribute(tag, "href")

            if not wm.does_exist(href_attribute):
                continue

            if wm.is_image(href_attribute):
                continue

            if href_attribute.endswith('/'):
                href_attribute = href_attribute[:-1]

            if wm.is_relative(href_attribute):
                href = wm.make_absolute(
                    url, href_attribute)  # Making the absolute URL
            else:
                href = href_attribute  # Already an absolute URL

            if href.find("#") > 1:
                pointer = href.find("#")
                href = href[:pointer]  # Cutting the fragment part in the URL

            is_found = False
            for website in websites:
                # If href belongs to one of the known websites, breaking the loop
                if href.startswith(website):
                    is_found = True
                    break
            if not is_found:
                continue

            # Inserting the href to Pages table with initial rank 1.0
            crawler_database.cursor.execute(
                'INSERT OR IGNORE INTO Pages (url, html, new_rank) '
                'VALUES ( ?, NULL, 1.0 )', (href, ))
            href_count = href_count + 1
            crawler_database.connection.commit()

            # Getting the id of the href
            crawler_database.cursor.execute(
                'SELECT id FROM Pages WHERE url=? LIMIT 1', (href, ))
            row = crawler_database.cursor.fetchone()
            if row is None:
                print("Couldn't retrieve id of href ->", href)
                continue

            to_id = row[0]

            # Inserting from_id (currently crawling pages id) and to_id (href's id) to Links Table
            crawler_database.cursor.execute(
                'INSERT OR IGNORE INTO Links (from_id, to_id) '
                'VALUES ( ?, ? )', (from_id, to_id))

        print("| links found =", href_count, "\n")

    crawler_database.disconnect_db()
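The crawl loop above only assumes a crawler_database object exposing connect_db, check_db, disconnect_db, a cursor and a connection, plus the wm and hp helper modules that are not shown. A minimal sketch of such a wrapper, with a table layout inferred from the queries in the example (an assumption, not the project's actual schema):

import sqlite3

class CrawlerDatabase:
    # Hypothetical stand-in exposing the attributes used by crawl().
    def __init__(self, path="crawler.sqlite"):
        self.path = path
        self.connection = None
        self.cursor = None

    def connect_db(self):
        self.connection = sqlite3.connect(self.path)
        self.cursor = self.connection.cursor()
        self.cursor.executescript('''
            CREATE TABLE IF NOT EXISTS Websites (url TEXT UNIQUE);
            CREATE TABLE IF NOT EXISTS Pages (
                id INTEGER PRIMARY KEY, url TEXT UNIQUE,
                html TEXT, error INTEGER, new_rank REAL);
            CREATE TABLE IF NOT EXISTS Links (
                from_id INTEGER, to_id INTEGER,
                UNIQUE (from_id, to_id));''')

    def check_db(self):
        # Treat any previously stored page as a crawl to resume.
        self.cursor.execute('SELECT COUNT(*) FROM Pages')
        return self.cursor.fetchone()[0] > 0

    def disconnect_db(self):
        self.connection.commit()
        self.connection.close()

# crawl(CrawlerDatabase(), "https://example.com/")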
Example #9
 def process(self):
     try:
         # step 1: get the text content and title
         print("INFO: Getting the text content of web page.")
         title, content = content_extraction.get_text(self.__url)
         if title is None or content is None:
             raise ValueError("Could not retrieve text data.")
         # step 2: get all title, h1 and meta tag content of the web page
         print("INFO: Getting the title, meta and h1 tag content of web page.")
         title_content, meta_content, h1_tag_content = html_parser.parse_html(self.__url)
         # step 3: get noun words and phrases (from wiki search)
         print("INFO: Extracting single word nouns from text content and searching related phrases from wikipedia articles.")
         print("INFO: This might take a few seconds.")
         nouns = noun_extractor.get_nouns(title, content)
         is_input_small = len(nouns) < 15
         # step 4: ranking - give each word a score based on several factors
         print("INFO: Ranking words and phrases based on factors like occurrence, frequency etc.")
         ranked_words = []
         for w in nouns:
             ranked_word = ranking.do_rank(self.__url, w, content, title_content, meta_content, h1_tag_content)
             ranked_words.append(ranked_word)
         # step 5: group similar phrases and eliminate repetition for more diverse keywords
         print("INFO: Grouping similar words to clusters and getting the most ranked words from each cluster")
         print("INFO: This might take a few seconds.")
         words, clusters = k_means_grouper.get_clusters(ranked_words)
         final_ranked_words = []
         # add all keywords that had no match in the data set and hence cannot be grouped
         final_ranked_words.extend(words)
         # sort each cluster by score and keep the highest ranked entry from each cluster
         for data_cluster in clusters:
             data_cluster.sort(key=operator.attrgetter('score'), reverse=True)
             final_ranked_words.append(data_cluster[0])
         print("INFO: Sorting all the words and phrases based on the ranking scores and getting Top 15 words.")
         # sort the final list of ranked words
         final_ranked_words.sort(key=operator.attrgetter('score'), reverse=True)
         count = 0
         key_words = []
         # keep only the first 15 ranked keywords
         for rw in final_ranked_words:
             count += 1
             if len(key_words) == 15:
                 break
             if rw.isUpper:
                 key_words.append(rw.getword().upper())
             else:
                 key_words.append(rw.getword().title())
         print("INFO: Success, check your words in the UI.")
         if count < 15:
             return {'words': key_words, 'is_input_small': is_input_small}
         return {'words': key_words}
     except ValueError as e:
         print(e)
         return {'error': e}
     except Exception as e:
         print("ERROR: Some error occurred.")
         print(e)
         return {'error': "Sorry, something went wrong! Please verify the URL."}
Example #10
def download_data():
    response = requests.get(DOWNLOAD_URL)
    parse_html()
    return response.text
Example #11
 def _populate_textarea(self):
     parse_html()
     self._textarea.setPlainText(unicode(self._reply.readAll(), 'utf-8'))
Example #12
def parse(content):
    content = deduplicate(content)
    patch_parsers()
    html_parser.parse_html(content)
    return check_sequences(content)