Example #1
def store_date(org, dst, date_str):
    url = Url()
    url1 = url.getUrl(org, dst, date_str)

    print(url1)
Example #2
    def __init__(self, base, artist, song, separator='-', extension='.html'):
        Url.__init__(self)

        self.base = base
        self.separator = separator
        self.extension = extension
        self.artist = UrlHelper.remove_url_nonesense(artist.lower())
        self.song = UrlHelper.remove_url_nonesense(song.lower())
Example #3
class Source:
    def __init__(self, url, name, lookupId):
        self.url = Url(url)
        self.name = name
        self.lookupId = lookupId
        self.categories = []

    def getHumanReadableName(self):
        if self.url.getDomain() in DOMAIN_HUMAN_READABLE:
            return DOMAIN_HUMAN_READABLE[self.url.getDomain()] + ' - ' + self.name
        return self.name

    def isAggregator(self):
        return (self.url.getDomain() in AGGREGATOR_DOMAIN)

    def __repr__(self):
        return "<Source(%s, %s)>" % (self.name.encode('ascii', 'ignore'), self.url)
Example #4
 def add_url(self, url, rank=0, method="url"):
     url = Url(url)
     try:
         self.db.seeds.insert_one({
                             "date": [self.date],
                             "url": url.url,
                             "url_id" : url.url_id,
                             "url_info": url.export(),
                             "title": None,
                             "description": None,
                             "rank": rank,
                             "source_url": None,
                             "depth": 0,
                             "method": method
                             })
                                 
     except pymongo.errors.DuplicateKeyError:
         pass
     return self
Example #5
    def service_obj_creator(self):
        url_obj_list = []

        # Read entries until an "end" marker or EOF is reached.
        with open("URLs.txt") as f:
            for line in f:
                if "end" in line:
                    break

                urllist = line.split()
                print(urllist)
                url_obj_list.append(
                    Url(urllist[1], urllist[0], self.usr, self.password))

        return url_obj_list
Example #6
	def add(self, url):
		# Add a new url: check whether it has already been crawled; if so, do not add it
		if self._filter(url):
			url_type = 'explore'
			if re.search(r'explore', url):
				url_type = 'explore'
			if re.search(r'topics', url):
				url_type = 'topics'
			if re.search(r'/question/\d+/*$', url):
				url_type = 'question'
			if re.search(r'/question/\d+/answer/\d+/?$', url):
				url_type = 'answer'
			if re.search(r'/people/\w+/?/$', url):
				url_type = 'people'

			u_obj = Url(url=url, url_type=url_type)

			self.urls.append(u_obj)
Example #7
    def post(self):
        """Post short URL.
        Returns a short URL for a long URL.
        :param object request_body: the long URL with its meta data
        :rtype: json: contains the long URL, the short URL and the metadata
        """
        body = request.get_json()

        long_url = body["long_url"]
        custom_salt = get_key(body, 'custom_salt')
        custom_url = get_key(body, 'custom_url')
        tag = get_key(body, 'tag')
        metadata = get_key(body, 'metadata')
        url_type = body["type"][0]

        if url_type == 'iota':
            url = IotaUrl(address=long_url,
                          tag=tag,
                          metadata=metadata,
                          custom_salt=custom_salt)
        elif url_type == 'document':
            url = DocumentUrl(document_hash=long_url,
                              tag=tag,
                              metadata=metadata,
                              custom_salt=custom_salt)
        else:
            url = Url(long_url=long_url,
                      tag=tag,
                      metadata=metadata,
                      custom_salt=custom_salt)

        if custom_url:
            url.random_id = custom_url

        url_manager = UrlManager()
        message = url_manager.publish_url(url=url)
        message = message.json
        status = 200

        response = app.response_class(response=json.dumps(message),
                                      status=status,
                                      mimetype='application/json')

        return response
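For orientation, a client would call this handler with a JSON POST. The sketch below uses the requests library; the host, route, and payload values are assumptions for illustration, while the field names ("long_url", "type", "tag") come from the handler above.

import requests

# Hypothetical host and route; only the JSON field names are taken from the handler above.
payload = {
    "long_url": "https://example.com/some/very/long/path",
    "type": ["url"],   # any value other than 'iota' or 'document' falls through to plain Url
    "tag": "demo",
}
resp = requests.post("http://localhost:5000/shorten", json=payload)
print(resp.status_code, resp.json())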
Example #8
    def urls_from_content(self, content):
        """Retrieves the URLs from a request content.
    
        Keyword arguments:
        content -- page content, as retrieved via urllib urlopen or compatible with lxml.
        """
        if content:
            bs = BeautifulSoup(content, 'lxml')
            urls = []
        
            for u in bs.findAll('a'):
                href = u.get('href')
                if not href:
                    error("Failed to process {}".format(href))
                    continue
                new_url = Url(href, domain=self.crawl_url.netloc(), protocol=self.crawl_url.proto())
                urls.append(new_url)

            return urls
        else:
            return []
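A hedged usage sketch for this method, assuming a crawler class whose constructor sets self.crawl_url as in Example #37 (the class name Crawler is hypothetical) and fetching the page with urllib:

from urllib.request import urlopen

# Hypothetical driver; Crawler stands for whatever class owns urls_from_content().
crawler = Crawler('http://example.com/')
content = urlopen('http://example.com/').read()
for found in crawler.urls_from_content(content):
    print(found)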
Example #9
    def validate_url(self, short_url: str, long_url: str, custom_salt: str = None):
        random_id = short_url.split("/")[-1]

        url = Url()
        url.random_id = random_id

        url_transactions = self.node_manager.retrieve_transactions(address=url.address)

        if not url_transactions:
            return False

        for url_transaction in url_transactions:
            url_to_validate = Url(custom_salt=custom_salt)
            url_to_validate.from_message(url_transaction.message)
            if url_to_validate.long_url == long_url:
                if url_to_validate.is_valid:
                    return True

        return False
Example #10
    def base(self):
        """
        Returns the base URI for this response

        :rtype: class:`Url` or None
        """

        url = None
        if self.header('Content-Base'):
            url = self.header('Content-Base')
        if self.header('Content-Location'):
            url = self.header('Content-Location')
        if url is None and self.request:
            url = self.request.url

        if not url:
            return None

        if not isinstance(url, Url):
            url = Url(url)

        return url
Example #11
    def test_url_protocol_parsing(self):

        self.assertEqual(Url("https://www.google.com").proto(), "https")
        self.assertEqual(Url("http://www.google.com").proto(), "http")
        self.assertEqual(Url("https://google.com").proto(), "https")
Example #12
 def doc(self) -> Url:
     return Url('https://wiki.micdoodle8.com/wiki/Galacticraft')
Example #13
 def __init__(self, urls):
     self.urls = []
     for url in urls:
         self.urls.append(Url(url))
Example #14
def test_get_path_from_url_root_directory():
    assert get_path_from_url('/root/path',
                             Url('http://my.DOMAIN.com/this/file.html')
                             ) == '/root/path/my.domain.com/this/file.html%$%'
Example #15
 def url(self) -> Url:
     return Url('https://minecraft.curseforge.com')
Example #16
from url import Url
from document import Document

if __name__ == "__main__":
    raw_file = open(DATA_PATH+"rawlist.data")
    url_file = open(DATA_PATH+"url.index")
    doc_file = open(DATA_PATH+"doc.index")

    url_list = []
    doc_list = []

    for line in url_file.readlines():
        line = line.split('\t')
        summary = line[0]
        docId = int(line[1])
        url = Url()
        url.m_summary = summary
        url.m_docId = docId
        url_list.append(url)
    
    for line in doc_file.readlines():
        line = line.split('\t')
        docId = int(line[0])
        pos = int(line[1])
        document = Document()
        document.m_docId = docId
        document.m_pos = pos
        doc_list.append(document)

    index_dict = {}
    title_index_dict = {}
Example #17
 def url_item_insert(self, html, purl, table_name='url_item'):
     for item in Url.url_todo(html, purl):
         sql = self.db.sql_insert(table_name, item)
         self.db.execsql(sql)  
Example #18
 def get_url(self, url_tuple):
     with open(self.url_file(url_tuple), 'r') as file:
         return Url(json=json.load(file))
Example #19
 def __init__(self):
     self._logger = logger
     self._url = Url()
Example #20
class PycURLLibrary():
    """PycURLLibrary is a library for functional testing with URL syntax, supporting DICT, FILE, FTP, FTPS, Gopher, HTTP, 
    HTTPS, IMAP, IMAPS, LDAP, LDAPS, POP3, POP3S, RTMP, RTSP, SCP, SFTP, SMTP, SMTPS, Telnet and TFTP. 
    PycURLLibrary supports SSL certificates and more.

    PycURLLibrary is based on PycURL [http://pycurl.sourceforge.net/], 
    PycURL is a Python interface to libcurl [http://curl.haxx.se/libcurl/].
    
    xml.etree.ElementTree [http://docs.python.org/2/library/xml.etree.elementtree.html] is used for XML operations.

    Supported XPath syntax (from Python v2.7.5 documentation):

    | Syntax | Meaning |
    | tag    | Selects all child elements with the given tag. For example, spam selects all child elements named spam, spam/egg selects all grandchildren named egg in all children named spam. |
    | *      | Selects all child elements. For example, */egg selects all grandchildren named egg. |
    | .      | Selects the current node. This is mostly useful at the beginning of the path, to indicate that it’s a relative path. |
    | //     | Selects all subelements, on all levels beneath the current element. For example, .//egg selects all egg elements in the entire tree. |
    | ..     | Selects the parent element. |
    | [@attrib] | Selects all elements that have the given attribute. |
    | [@attrib='value'] | Selects all elements for which the given attribute has the given value. The value cannot contain quotes. |
    | [tag]  | Selects all elements that have a child named tag. Only immediate children are supported. |
    | [position] | Selects all elements that are located at the given position. The position can be either an integer (1 is the first position), the expression last() (for the last position), or a position relative to the last position (e.g. last()-1). |
    """
    
    ROBOT_LIBRARY_VERSION = VERSION
    ROBOT_LIBRARY_SCOPE = "TEST CASE"
    ROBOT_LIBRARY_DOC_FORMAT = "ROBOT"

    def __init__(self):
        self._logger = logger
        self._url = Url()
        
    def verbose(self):
        """Makes the fetching more verbose/talkative.
        
        Mostly useful for debugging. A line starting with '>' means "header data" sent by curl, 
        '<' means "header data" received by curl that is hidden in normal cases, and a line starting with '*' means additional info provided by curl.
        
        Note that if you only want HTTP headers in the output, -i, --include might be the option you're looking for.
        
        If you think this option still doesn't give you enough details, consider using --trace or --trace-ascii instead.
        
        This option overrides previous uses of --trace-ascii or --trace.         
        """
        self._url.set_verbose(True)
        
#    def no_buffer(self):
#        """Disables the buffering of the output stream.
#        
#        In normal work situations, curl will use a standard buffered output stream that will have the effect that it will output the data in chunks, 
#        not necessarily exactly when the data arrives. Using this option will disable that buffering.
#        Note that this is the negated option name documented. You can thus use --buffer to enforce the buffering. 
#        """
        
    def server_connection_establishment_timeout(self, timeout):
        """The maximum time in seconds that you allow the connection to the server to take (long value).
        This only limits the connection phase, once it has connected, this option is of no more use. 
        Set to zero to switch to the default built-in connection timeout - 300 seconds.
        """
        self._url.get_context().set_server_connection_establishment_timeout(long(str(timeout)))
        
    def insecure_ssl(self):
        """(SSL) This option explicitly allows curl to perform "insecure" SSL connections and transfers.
        
        All SSL connections are attempted to be made secure by using the CA certificate bundle installed by default. 
        This makes all connections considered "insecure" fail unless -k, --insecure is used. 
        """
        self._url.set_insecure(True)
        
    def request_method(self, requestMethod):
        """ Set's the request method. Default's to GET if Post Fields keyword is used POST is used
        | Method |
        | GET |
        | POST |
        | PUT |
        | DELETE |
        """
        self._url.get_context().set_request_method(requestMethod)
        
    def add_header(self, header):
        """(HTTP) Extra header to use when getting a web page.
        
        Each *Add Header* keyword is equivalent for one <-H, --header> argument with curl
        
        Examples:
        | Add Header | Content-Type: text/xml; charset=UTF-8 |
        | Add Header | Frame.Version:3.0 |
        """
        self._logger.info('Header %s' % header)
        self._url.get_context().add_header(str(header))
        
    def headers_file(self, headerFile):
        """(HTTP) Extra headers to use when getting a web page.
        
        *headerFile* contains all headers.
        
        One line is one header. Note do not make line feed after last header.

        Example:
        | Headers File | /data/headers.txt |
        
        Example of content of *headerFile*:
        | Version: 2 |
        | Content-Type: text/xml; charset=UTF-8 |        
        """
        headers = [line.rstrip() for line in open(headerFile, 'r')] 
        self._logger.info('Headers %s' % headers)
        self._url.get_context().set_headers(headers)
        
        
    def post_fields(self, postFields):
        """(HTTP) Sends the specified data in a POST request to the HTTP server, 
        in the same way that a browser does when a user has filled in an HTML form and presses the submit button. 
        This will cause curl to pass the data to the server using the content-type application/x-www-form-urlencoded.

        Equivalent for <--data> argument 
        
        Example:
        | Post Fields | pizza=Quattro+Stagioni&extra=cheese |
        """
        self._url.set_post_fields(postFields)
        if postFields is not None:
            self._url.get_context().set_request_method('POST')

    def post_fields_file(self, postFieldsFile):
        """(HTTP) Sends the specified data in a POST request to the HTTP server, 
        in the same way that a browser does when a user has filled in an HTML form and presses the submit button. 
        This will cause curl to pass the data to the server using the content-type application/x-www-form-urlencoded.

        Equivalent for <--data> @argument 
        
        Example:
        | Post Fields File | /data/message.txt |
        """
        f = open(postFieldsFile, 'r')
        postFields = f.read()
        f.close()
        self._url.set_post_fields(postFields)
        self._url.get_context().set_request_method('POST')

    def set_url(self, url):
        """Specify a URL to fetch.
        """
        self._url.get_context().set_url(str(url))
        
    def ca_path(self, cacertDirectory):
        """(SSL) Tells curl to use the specified certificate directory to verify the peer. 
        Multiple paths can be provided by separating them with ":" (e.g. "path1:path2:path3"). 
        The certificates must be in PEM format. 
        
        Equivalent for <--capath> argument with curl
        """
        self._url.get_context().set_capath(str(cacertDirectory))
        
    def client_certificate_file(self, cert):
        """(SSL) Tells curl to use the specified client certificate file when getting a file with HTTPS, 
        FTPS or another SSL-based protocol. The certificate must be in PEM format

        Equivalent for <--cert> argument with curl
        """
        self._url.get_context().set_client_certificate_file(str(cert))
        
    def private_key_file(self, key):
        """(SSL/SSH) Private key file name. 
        Allows you to provide your private key in this separate file. 

        Equivalent for <--key> argument with curl
        """
        self._url.get_context().set_private_key_file(str(key))
        
    def perform(self):
        """Perform curl perform.
        """
        self._url.perform()
        
    def response(self):
        """Get response from latest perform result
        """
        return self._url.get_context().get_response()

    def response_headers(self):
        """Get response headers from latest perform result for protocols having headers preceding the data (like HTTP)
        """
        return self._url.get_context().get_response_headers()

    def parse_xml(self):
        """Parses an XML section of the response. Returns an root Element instance.
        """
        return self._url.get_context().parse_response_xml()

    def xml_root_element(self):
        """Returns the result root Element instance of `Parse Xml` keyword.
        """
        return self._url.get_context().get_xml_root_element()
    
    def find_elements(self, element, xpath):
        """Returns a list containing all matching elements in document order
        
        Examples:
        | Find Elements | ${root} | .//{http://ws.poc.jivalo/hello/v1}customer |
        | Find Elements | ${root} | .//name |
        """
        assert element is not None, \
            'Element is Null.' 
        xp = str(xpath)
        return element.findall(xp)
    
    def find_first_element(self, element, xpath):
        """Finds the first subelement matching *xpath*. Match may be a _tag name_ or _path_. Returns an element instance or None.
        
        Examples:
        | Find First Element | ${root} | .//{http://ws.poc.jivalo/hello/v1}customer |
        | Find First Element | ${root} | .//name |
        """
        assert element is not None, \
            'Element is Null.' 
        xp = str(xpath)
        return element.find(xp)
    
    def should_contain_element(self, element, xpath):
        """Fails if the *element* does not contain *xpath* element       
        Examples:
        | Should Contain Element | ${root} | .//{http://ws.poc.jivalo/hello/v1}customer |
        | Should Contain Element | ${root} | .//name |
        """
        elements = self.find_elements(element, xpath)
        assert elements, \
            'Element "%s" contains not XPaht element "%s".'  % (
            element.tag, xpath)
            
    def element_should_contain(self, element, text):
        """Fails if the *element* text value does not contain *text*      
        Examples:
        | Element Should Contain | ${elem} | Hello, world! |
        """
        assert text in element.text, \
            'Element "%s" does not contains text "%s".'  % (
            element.tag, text)
            
    def element_should_match(self, element, text):
        """Fails if the *element* text value does not match *text*      
        Examples:
        | Element Should Match | ${elem} | Hello, world! |
        """
        assert text == element.text, \
            'Element "%s" does not match text "%s".'  % (
            element.tag, text)
            
    def http_response_status(self):
        """Get response status from latest HTTP response status line
        """
        return self._url.get_context().get_response_status()

    def response_status_should_contain(self, text):
        """Fails if the _Response Status_  does not contain *text*      
        Examples:
        | Response Status Should Contain | 200 |
        """
        assert str(text) in str(self.http_response_status()), \
            'Response Status "%s" does not contains text "%s".'  % (
            self.http_response_status(), text)
        
    def log_response(self, log_level='INFO'):
        """
        Logs the response of the URL transfer.

        Specify *log_level* (default: "INFO") to set the log level.
        """        
        if self.response():
            self._logger.write("Response body:", log_level)
            self._logger.write(self.response(), log_level)
        else:
            self._logger.debug("No response received", log_level)

    def log_response_headers(self, log_level='INFO'):
        """
        Logs the response headers for protocols having headers preceding the data (like HTTP), line by line.

        Specify *log_level* (default: "INFO") to set the log level.
        """        
        if self.response_headers():
            self._logger.write("HTTP Response headers:", log_level)
            for header in self.response_headers():
                self._logger.write(header, log_level)
        else:
            self._logger.debug("No HTTP response headers received", log_level)
        

    def log_http_response_status(self, log_level='INFO'):
        """
        Logs the HTTP response header status line.

        Specify *log_level* (default: "INFO") to set the log level.
        """        
        if self.http_response_status():
            self._logger.write("HTTP Response status:", log_level)
            self._logger.write(self.http_response_status(), log_level)
        else:
            self._logger.debug("No HTTP response status received", log_level)
        
    def log_version(self, log_level='INFO'):
        """
        Logs the PycURLLibrary Version.

        Specify *log_level* (default: "INFO") to set the log level.
        """
        self._logger.write("PycURLLibrary version %s" % (self.ROBOT_LIBRARY_VERSION), log_level)
Example #21
 def __init__(self, url, name, lookupId):
     self.url = Url(url)
     self.name = name
     self.lookupId = lookupId
     self.categories = []
Example #22
 def __init__(self):
     Url.__init__(self)
Example #23
 def setUp(self) -> None:
     self.kidega_site = DistSite('Kidega', 'query', 'https://kidega.com/arama', 'searcHeadArea.txt', 20, 'page')
     self.idefix_site = DistSite('Idefix', 'Q', 'https://idefix.com/search', 'searcHeadArea.txt', 36, 'Page')
     self.url_kidega = Url(self.kidega_site, 'python')
     self.url_idefix = Url(self.idefix_site, 'python')
Example #24
    def test_url_subdomain_parsing(self):

        self.assertEqual(
            Url("http://www.fake.google.com").subdomain(), "www.fake")
        self.assertEqual(Url("http://fake.google.com").subdomain(), "fake")
Example #25
 def test_url_suffix_parsing(self):
     self.assertEqual(Url("http://www.google.com").suffix(), "com")
     self.assertEqual(Url("http://www.google.co.uk").suffix(), "co.uk")
Example #26
def crawl(url, workers=None, limit_to_domain=True, robot=False, single=False):
    """Crawls a given url to determine its link tree.
    
    Keyword arguments:
    url -- the url to crawl
    workers -- the number of processes to spawn (default cpu_count() * 2)
    limit_to_domain -- if the crawler should be limited to the url domain (default True)
    """
    u = Url(url)
    domain = u.domain()

    # Establish communication queues
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()

    # Start consumers
    if robot:
        rob = Robot(u.url())
        num_consumers = 1
    elif workers:
        num_consumers = workers
    else:
        num_consumers = multiprocessing.cpu_count() * 2

    debug('Creating {} consumers'.format(num_consumers))
    consumers = [Consumer(tasks, results) for i in range(num_consumers)]

    for w in consumers:
        w.start()

    num_jobs = 1
    tasks.put(CrawlerTask(url))

    # Keeps executing while there are URLs to process
    while num_jobs > 0:

        debug("Number of jobs: {}".format(num_jobs))

        debug("Fetching results")

        result = results.get()

        debug("Got results")

        if limit_to_domain:
            # Filter urls based on domain (this could be merged to previous filter)
            domain_urls = list(
                filter(lambda x: x.domain() == domain, result['urls']))
        else:
            domain_urls = result['urls']

        # Filter urls based on freshness (i.e., do not parse repeated urls)
        new_urls = list(filter(lambda x: is_new_url(x.url()), domain_urls))

        # Log each newly discovered URL together with its parent
        for r in new_urls:
            info("{} -> {}".format(result['parent'], r.url()))

        debug("URL stats: Total {} Domain {} New {}".format(
            len(result['urls']), len(domain_urls), len(new_urls)))

        for r in new_urls:

            if robot and rob.should_block(r):
                info("Blocked access to {}".format(r.url()))
                continue

            print_url(r.url())

            if not single:
                debug('Scheduling: {}'.format(r.url()))

                tasks.put(CrawlerTask(r.url()))

                if robot and rob.throttle_time():
                    info('Sleeping {} seconds'.format(rob.throttle_time()))
                    sleep(rob.throttle_time())

                num_jobs += 1

        num_jobs -= 1

        debug("Done scheduling")

    # This stops all the processes
    for i in range(num_consumers):
        tasks.put(None)

    # Waits for the correct killing of the processes
    tasks.join()

    debug("Done")
Example #27
 def resolve_download_url(self, url: Url) -> Url:
     unshortener = AdfLy()
     return Url(unshortener.unshorten(url))
Example #28
from __future__ import unicode_literals
from url import Url
import os

url = Url(
    'http://www.timesoccer.com/video/burnley-vs-bournemouth-live-streaming-highlights.html'
)

url.youtube_dl()

os.startfile(url.video_path)
Example #29
 def setUp(self) -> None:
     self.dist1 = DistSite('Kidega', 'query', 'https://kidega.com/arama',
                           'searcHeadArea.txt', 20, 'page')
     self.url1 = Url(self.dist1, 'python')
     self.page1 = Page(self.url1)
     self.pager1 = Pager(self.page1)
Example #30
 def extract(self, response, depth=0, filters=True):
     article = {}
     html = response.text
     url = response.url
     url = Url(url)
     article["url"] = url.url
     article["url_info"] = url.export()
     article["url_id"] = url.url_id
     article["depth"] = depth
     article["type"] = response.headers['content-type']
     article["date"] = self.date
     article["encoding"] = response.encoding.lower()
     
     article["status"] = True
     if url.status:
         article_txt = lxml_extractor(html, url)
         article["title"] = self.extract_title(html)
         article["meta"] = self.extract_meta(html)
         article["keywords"] = self.extract_keywords(article["meta"])
         if filters:
             if self.check_lang(article_txt):
                 if self.check_query(article_txt):
                     article["html_file"] = self.store_file(article["url_id"], html, fmt="html")
                     article["txt_file"] = self.store_file(article["url_id"], article_txt, fmt="txt")
                     outlinks = self.extract_outlinks(html, url, depth)
                     article["citeds_url"] = [n["url"] for n in outlinks]
                     article["cited_url_ids"] = [n["url_id"] for n in outlinks]
                     article["outlinks"] =  outlinks
                     article["lang"] = self.page_lang
                     return article
                     
                 else:
                     if self.check_query(article["title"]):                            
                         article["html_file"] = self.store_file(article["url_id"], html, fmt="html")
                         article["txt_file"] = self.store_file(article["url_id"], article_txt, fmt="txt")
                         outlinks = self.extract_outlinks(html, url, depth)
                         article["cited_urls"] = [n["url"] for n in outlinks]
                         article["cited_url_ids"] = [n["url_id"] for n in outlinks]
                         article["outlinks"] =  outlinks
                         article["lang"] = self.page_lang
                         article = self.extract_page(article, article_txt, html)
                         article["lang"] = self.page_lang
                         return article
                     else:
                         article["status"] = False
                         article["status_code"] = 900
                         article["msg"] = "Search expression not found"
                         return article
             else:
                 if self.check_lang(article["title"]):                        
                     article["html_file"] = self.store_file(article["url_id"], html, fmt="html")
                     article["txt_file"] = self.store_file(article["url_id"], article_txt, fmt="txt")
                     outlinks = self.extract_outlinks(html, url, depth)
                     article["cited_urls"] = [n["url"] for n in outlinks]
                     article["cited_url_ids"] = [n["url_id"] for n in outlinks]
                     article["outlinks"] =  outlinks
                     article["lang"] = self.page_lang
                     return article
                 else:
                     article["status"] = False
                     article["status_code"] = 1000
                     article["msg"] = "Lang is invalid"
                     article["lang"] = self.page_lang
                     return article
         else:
             self.check_lang(article_txt)
             article["html_file"] = self.store_file(article["url_id"], html, fmt="html")
             article["txt_file"] = self.store_file(article["url_id"], article_txt, fmt="txt")
             outlinks = self.extract_outlinks(html, url, depth)
             article["cited_urls"] = [n["url"] for n in outlinks]
             article["cited_url_ids"] = [n["url_id"] for n in outlinks]
             article["outlinks"] =  outlinks
             article["lang"] = self.page_lang
             return article
     else:
         article["status"] = False
         article["error"] = "Invalid url"
         article["status_code"] = 800
         return article
Example #31
def test_get_path_from_url_empty_path():
    # test with empty path urls
    assert get_path_from_url(
        '', Url('http://www.github.com')) == 'github.com/index.html%$%'
Example #32
 def doc(self) -> Url:
     return Url('https://pixelmonmod.com/wiki')
Example #33
def test_get_path_from_url_with_querystring():
    assert get_path_from_url('', Url('http://youtube.com/watch?v=ghsu3u43')
                             ) == 'youtube.com/watch?v=ghsu3u43%$%'
Example #34
 def __init__(self, url):
     self.url = Url(urljoin(url, '/robots.txt'))
     self.rerp = RobotExclusionRulesParser()
     self.rerp.user_agent = 'Mozilla/5.0'
     self.rerp.fetch(self.url.url())
Example #35
import json
import requests

from data import *
from url import Url

for url_to_check in urls_to_check:

    for i, url in enumerate(url_to_check['urls']):
        print(url)

        for j, child in enumerate(Url(url).get_children()):
            print(f'{i}.{j} - Checking {child}')

            if child.response_status() != 200 or j == 3:

                body = \
                    f'You have a broken link at {url}\n' + \
                    f'- Link: {child}\n' + \
                    '\n\n' + \
                    'This message was automatically created by <https://github.com/matheusvanzan/broken-links-monitor|broken-links-monitor>'

                response = requests.post(url_to_check['channel'],
                                         json.dumps({'text': body}))

                print(response)
Example #36
 def url(self) -> Url:
     return Url('https://micdoodle8.com')
Example #37
    def __init__(self, url, domain=None, protocol=None):

        self.crawl_url = Url(url, domain=domain, protocol=protocol)
Example #38
 def url(self) -> Url:
     return Url('https://reforged.gg')
Example #39
from lxml import html
import time

import mysql.connector

from url import Url

# 'index' is assumed to hold the raw HTML of a previously fetched page.
node = html.fromstring(index)

start = time.time()

mydb = mysql.connector.connect(host="localhost",
                               user="******",
                               passwd="root",
                               database="crawel")

cur = mydb.cursor()

#threads = []
for i in node.xpath("//body//a"):
    url = i.get("href")
    ourl = Url(url)
    sql = "insert into t_url (url, created_date) values ('%s', '%s')" % (
        ourl.url, ourl.created_date)
    print sql
    cur.execute(sql)
    #t = threading.Thread(target=downloader.download,args=(root, url, basedir))
    #t.start()
    #threads.append(t)

mydb.commit()
cur.close()
mydb.close()

#main thread waiting for sub thread
#for t in threads:
#    t.join()
Example #40
def getUrl(date_str,detal_days=30):
    url_obj=Url()
    url_list=url_obj.getAllUrl(date_str,detal_days)
    return url_list