Exemplo n.º 1
0
def seach(urllist):
    def process_keyurl(keyurl):
        if keyurl is not None:
            for key, urllist in keyurl.iteritems():
                for url in urllist:
                    urlinfo = gethtml.process_url(url)

                    if urlinfo is None:
                        continue

                    list, keyurl1 = urlinfo

                    if list is not None:
                        gethtml.collection.insert({'key':key, 'url':url, 'timetmp':time.time()})

                    if keyurl1 is not None:
                        process_keyurl(keyurl1)

    def process_urllist(url_list):
        for url in url_list:
            #print url,"sub url"
            urlinfo = gethtml.process_url(url)

            if urlinfo is None:
                continue

            list, keyurl = urlinfo

            if list is not None:
                process_urllist(list)

            if keyurl is not None:
                process_keyurl(keyurl)

            time.sleep(0.1)


    suburl = []
    subkeyurl = {}

    for url in urllist:
        print url, "root url"

        urlinfo = gethtml.process_url(url)

        if urlinfo is None:
            continue

        list, keyurl = urlinfo

        suburl.extend(list)
        subkeyurl.update(keyurl)

    try:
        process_urllist(suburl)
        process_keyurl(subkeyurl)

    except:
        import traceback
        traceback.print_exc()
Exemplo n.º 2
0
    def process_keyurl(keyurl):
        """Visit each URL of a ``{key: urls}`` map and upsert a record for it.

        Every URL (minus one trailing slash) is passed to
        ``gethtml.process_url``.  A ``None`` result is skipped.  Otherwise a
        ``{'key', 'url', 'timetmp'}`` record is written through
        ``gethtml.collection.update`` and, unless the result is the string
        ``"url is be processed"``, the nested key map from the result is
        walked recursively.

        :param keyurl: mapping of key -> iterable of URLs, or ``None``.
        """
        if keyurl is None:
            return

        for key1, key_urls in keyurl.iteritems():
            # Lower-case ASCII A-Z only, leaving every other character as is.
            # The key does not depend on the URL, so build it once per key.
            key = "".join(c.lower() if 'A' <= c <= 'Z' else c for c in key1)

            for url in key_urls:
                # Drop one trailing slash.  endswith() is safe on an empty
                # string, where url[len(url) - 1] raised IndexError.
                if url.endswith('/'):
                    url = url[:-1]

                urlinfo = gethtml.process_url(url)
                if urlinfo is None:
                    continue

                if isinstance(urlinfo, str) and urlinfo == "url is be processed":
                    keyurl1 = None
                else:
                    # Unpack before writing so a malformed result raises
                    # before anything is stored.
                    _links, keyurl1 = urlinfo

                # Third positional argument True: presumably the upsert flag
                # of pymongo's legacy Collection.update - confirm.
                gethtml.collection.update(
                    {'key':key, 'url':url},
                    {'$set':{'key':key, 'url':url, 'timetmp':time.time()}},
                    True)

                if keyurl1 is not None:
                    process_keyurl(keyurl1)
Exemplo n.º 3
0
    def process_urllist(url_list):
        """Fetch each URL of *url_list*, its direct links, and its key map.

        ``gethtml.process_url`` is called on every URL.  A ``None`` result is
        skipped.  Otherwise the result is unpacked as ``(links, keyurl)``:
        each link is fetched once (its result is discarded) and ``keyurl`` is
        handed to ``process_keyurl``.

        :param url_list: iterable of URLs.
        """
        for page_url in url_list:
            result = gethtml.process_url(page_url)
            if result is not None:
                links, keyurl = result

                # One level only: the links' own results are not followed.
                for link in (links if links is not None else ()):
                    gethtml.process_url(link)

                process_keyurl(keyurl)

                # Short pause after every URL that produced a result.
                time.sleep(0.1)
Exemplo n.º 4
0
    def process_urllist(url_list):
        """Process every URL in *url_list* together with its immediate links.

        For each URL whose ``gethtml.process_url`` result is not ``None``,
        the result is split into a link list and a key map.  Every link is
        passed to ``gethtml.process_url`` (return value ignored), then the
        key map goes to ``process_keyurl``.

        :param url_list: iterable of URLs.
        """
        for root in url_list:
            info = gethtml.process_url(root)
            if info is None:
                continue

            children, keyurl = info

            if children is not None:
                for child in children:
                    gethtml.process_url(child)

            process_keyurl(keyurl)

            # Pause briefly before moving on to the next URL.
            time.sleep(0.1)
Exemplo n.º 5
0
    def process_keyurl(keyurl):
        """Walk a ``{key: urls}`` map, upserting one record per URL.

        Each URL has a single trailing slash removed and is passed to
        ``gethtml.process_url``.  ``None`` results are skipped.  For any
        other result a ``{'key', 'url', 'timetmp'}`` record is written with
        ``gethtml.collection.update``.  When the result is not the string
        ``"url is be processed"`` it is unpacked as a pair, and its second
        element (a nested key map) is walked recursively.

        :param keyurl: mapping of key -> iterable of URLs, or ``None``.
        """
        if keyurl is None:
            return

        def ascii_lower(text):
            # Lower-case ASCII A-Z only; other characters pass through.
            return "".join(c.lower() if 'A' <= c <= 'Z' else c for c in text)

        for key1, key_urls in keyurl.iteritems():
            # The stored key is the same for every URL under key1.
            key = ascii_lower(key1)

            for url in key_urls:
                # endswith() copes with an empty URL, where indexing the
                # last character raised IndexError.
                if url.endswith('/'):
                    url = url[:-1]

                urlinfo = gethtml.process_url(url)
                if urlinfo is None:
                    continue

                seen_before = (isinstance(urlinfo, str)
                               and urlinfo == "url is be processed")
                if seen_before:
                    keyurl1 = None
                else:
                    # Unpack first so a malformed result raises before the
                    # record is written.
                    _links, keyurl1 = urlinfo

                # Third positional argument True: presumably the upsert flag
                # of pymongo's legacy Collection.update - confirm.
                gethtml.collection.update({
                    'key': key,
                    'url': url
                }, {
                    '$set': {
                        'key': key,
                        'url': url,
                        'timetmp': time.time()
                    }
                }, True)

                if keyurl1 is not None:
                    process_keyurl(keyurl1)
Exemplo n.º 6
0
    def process_urllist(url_list):
        """Pass each URL's key map to ``process_keyurl`` until one URL fails.

        ``gethtml.process_url`` is called on each URL in turn.  The first
        ``None`` result ends the whole walk.  Otherwise the result is
        unpacked as a pair and its second element is handed to
        ``process_keyurl``.

        :param url_list: iterable of URLs.
        """
        for url in url_list:
            urlinfo = gethtml.process_url(url)

            if urlinfo is None:
                # NOTE(review): this abandons the remaining URLs rather than
                # skipping just this one - confirm that is intended.
                break

            # The first element (the link list) is not used here.
            _links, keyurl = urlinfo
            process_keyurl(keyurl)

            time.sleep(0.1)
Exemplo n.º 7
0
    def process_urllist(url_list):
        """Feed the key map of each URL in *url_list* to ``process_keyurl``.

        Stops at the first URL for which ``gethtml.process_url`` returns
        ``None``.  Every other result is unpacked as a two-element pair, of
        which only the second element (the key map) is used.

        :param url_list: iterable of URLs.
        """
        for url in url_list:
            urlinfo = gethtml.process_url(url)

            if urlinfo is not None:
                _unused_links, keyurl = urlinfo
                process_keyurl(keyurl)

                # Brief pause between URLs.
                time.sleep(0.1)
            else:
                # NOTE(review): returns instead of continuing, so later URLs
                # are never visited - confirm that is intended.
                return
Exemplo n.º 8
0
    def process_keyurl(keyurl):
        """Insert a record for every URL of a ``{key: urls}`` map that yields links.

        Each URL is passed to ``gethtml.process_url``.  ``None`` results are
        skipped.  Otherwise the result is unpacked as a pair, and when its
        first element is not ``None`` a ``{'key', 'url', 'timetmp'}`` record
        is inserted through ``gethtml.collection.insert``.  The second
        element is not used.

        :param keyurl: mapping of key -> iterable of URLs, or ``None``.
        """
        if keyurl is None:
            return

        for key, key_urls in keyurl.iteritems():
            for url in key_urls:
                urlinfo = gethtml.process_url(url)
                if urlinfo is None:
                    continue

                links, _nested = urlinfo
                if links is None:
                    continue

                gethtml.collection.insert(
                    {'key':key, 'url':url, 'timetmp':time.time()})
Exemplo n.º 9
0
    def process_keyurl(keyurl):
        """Walk a ``{key: urls}`` map recursively, inserting a record per hit.

        Each URL is passed to ``gethtml.process_url``.  ``None`` results are
        skipped.  Otherwise the result is unpacked as
        ``(links, nested_keyurl)``.  A ``{'key', 'url', 'timetmp'}`` record
        is inserted when ``links`` is not ``None``, and ``nested_keyurl`` is
        walked the same way when it is not ``None``.

        :param keyurl: mapping of key -> iterable of URLs, or ``None``.
        """
        if keyurl is None:
            return

        for key, key_urls in keyurl.iteritems():
            for url in key_urls:
                urlinfo = gethtml.process_url(url)
                if urlinfo is not None:
                    links, nested_keyurl = urlinfo

                    if links is not None:
                        gethtml.collection.insert(
                            {'key':key, 'url':url, 'timetmp':time.time()})

                    if nested_keyurl is not None:
                        process_keyurl(nested_keyurl)
Exemplo n.º 10
0
    def process_urllist(url_list):
        """Recursively visit *url_list* and everything its URLs link to.

        Each URL is passed to ``gethtml.process_url``.  ``None`` results are
        skipped.  Otherwise the result is unpacked as ``(links, keyurl)``:
        a non-``None`` ``links`` is walked by this same function and a
        non-``None`` ``keyurl`` is handed to ``process_keyurl``.

        :param url_list: iterable of URLs.
        """
        for url in url_list:
            urlinfo = gethtml.process_url(url)
            if urlinfo is not None:
                links, keyurl = urlinfo

                if links is not None:
                    process_urllist(links)

                if keyurl is not None:
                    process_keyurl(keyurl)

                # Short pause after every URL that produced a result.
                time.sleep(0.1)
Exemplo n.º 11
0
    def process_urllist(url_list):
        """Recursively visit *url_list* and everything its URLs link to.

        Each URL has a single trailing slash removed and is passed to
        ``gethtml.process_url``.  Results that are ``None`` or a ``str`` are
        skipped.  Any other result is unpacked as ``(links, keyurl)``: a
        non-``None`` ``links`` is walked by this same function and a
        non-``None`` ``keyurl`` is handed to ``process_keyurl``.

        :param url_list: iterable of URL strings.
        """
        for url in url_list:
            # Drop one trailing slash.  endswith() is safe on an empty
            # string, where url[len(url) - 1] raised IndexError.
            if url.endswith('/'):
                url = url[:-1]

            urlinfo = gethtml.process_url(url)

            # Neither None nor a string can be unpacked into the pair below.
            if urlinfo is None or isinstance(urlinfo, str):
                continue

            links, keyurl = urlinfo

            if links is not None:
                process_urllist(links)

            if keyurl is not None:
                process_keyurl(keyurl)

            # Short pause after every URL that produced a usable result.
            time.sleep(0.1)
Exemplo n.º 12
0
    def process_urllist(url_list):
        """Walk *url_list* depth-first through the links each URL returns.

        For every URL, one trailing slash is removed and the URL is passed
        to ``gethtml.process_url``.  A ``None`` or ``str`` result is skipped.
        Otherwise the result is a ``(links, keyurl)`` pair: ``links`` (when
        not ``None``) is walked recursively, and ``keyurl`` (when not
        ``None``) is passed to ``process_keyurl``.

        :param url_list: iterable of URL strings.
        """
        for raw_url in url_list:
            # endswith() copes with an empty URL, where indexing the last
            # character raised IndexError.
            url = raw_url[:-1] if raw_url.endswith('/') else raw_url

            urlinfo = gethtml.process_url(url)

            # Only a (links, keyurl) pair can be followed any further.
            if urlinfo is None or isinstance(urlinfo, str):
                continue

            links, keyurl = urlinfo

            if links is not None:
                process_urllist(links)

            if keyurl is not None:
                process_keyurl(keyurl)

            # Brief pause before the next URL.
            time.sleep(0.1)
Exemplo n.º 13
0
def seach(urllist):
    gethtml.process_url_list = []

    def process_keyurl(keyurl):
        if keyurl is not None:
            for key1, urllist in keyurl.iteritems():
                for url in urllist:
                    #print url
                    if url[len(url) - 1] == '/':
                        url = url[0:-1]
                        #print url

                    #print url

                    urlinfo = gethtml.process_url(url)

                    if urlinfo is None:
                        continue

                    if isinstance(urlinfo,
                                  str) and urlinfo == "url is be processed":
                        key = ""
                        for c in key1:
                            if c >= 'A' and c <= 'Z':
                                c = c.lower()
                            key += c
                        gethtml.collection.update({
                            'key': key,
                            'url': url
                        }, {
                            '$set': {
                                'key': key,
                                'url': url,
                                'timetmp': time.time()
                            }
                        }, True)
                    else:
                        list, keyurl1 = urlinfo

                        key = ""
                        for c in key1:
                            if c >= 'A' and c <= 'Z':
                                c = c.lower()
                            key += c
                        gethtml.collection.update({
                            'key': key,
                            'url': url
                        }, {
                            '$set': {
                                'key': key,
                                'url': url,
                                'timetmp': time.time()
                            }
                        }, True)

                        if keyurl1 is not None:
                            process_keyurl(keyurl1)

    def process_urllist(url_list):
        for url in url_list:
            #print url
            if url[len(url) - 1] == '/':
                url = url[0:-1]
                #print url

            #print url

            urlinfo = gethtml.process_url(url)

            if urlinfo is None or isinstance(urlinfo, str):
                continue

            list, keyurl = urlinfo

            if list is not None:
                process_urllist(list)

            if keyurl is not None:
                process_keyurl(keyurl)

            time.sleep(0.1)

    for url in urllist:
        print url, "root url"

        urlinfo = gethtml.process_url(url)

        if urlinfo is None or isinstance(urlinfo, str):
            print "error root url", url
            continue

        list, keyurl = urlinfo

        #print list
        #print keyurl

        try:
            process_urllist(list)
            process_keyurl(keyurl)

        except:
            import traceback
            traceback.print_exc()
Exemplo n.º 14
0
def seach(urllist):
    gethtml.process_url_list = []

    def process_keyurl(keyurl):
        if keyurl is not None:
            for key1, urllist in keyurl.iteritems():
                for url in urllist:
                    #print url
                    if url[len(url) - 1] == '/':
                        url = url[0:-1]
                        #print url

                    #print url

                    urlinfo = gethtml.process_url(url)

                    if urlinfo is None:
                        continue

                    if isinstance(urlinfo, str) and urlinfo == "url is be processed":
                        key = ""
                        for c in key1:
                            if c >= 'A' and c <= 'Z':
                                c = c.lower()
                            key += c
                        gethtml.collection.update({'key':key, 'url':url}, {'$set':{'key':key, 'url':url, 'timetmp':time.time()}}, True)
                    else:
                        list, keyurl1 = urlinfo

                        key = ""
                        for c in key1:
                            if c >= 'A' and c <= 'Z':
                                c = c.lower()
                            key += c
                        gethtml.collection.update({'key':key, 'url':url}, {'$set':{'key':key, 'url':url, 'timetmp':time.time()}}, True)

                        if keyurl1 is not None:
                            process_keyurl(keyurl1)

    def process_urllist(url_list):
        for url in url_list:
            #print url
            if url[len(url) - 1] == '/':
                url = url[0:-1]
                #print url

            #print url

            urlinfo = gethtml.process_url(url)

            if urlinfo is None or isinstance(urlinfo, str):
                continue

            list, keyurl = urlinfo

            if list is not None:
                process_urllist(list)

            if keyurl is not None:
                process_keyurl(keyurl)

            time.sleep(0.1)

    for url in urllist:
        print url, "root url"

        urlinfo = gethtml.process_url(url)

        if urlinfo is None or isinstance(urlinfo, str):
            print "error root url",url
            continue

        list, keyurl = urlinfo

        #print list
        #print keyurl

        try:
            process_urllist(list)
            process_keyurl(keyurl)

        except:
            import traceback
            traceback.print_exc()