Example #1
    def find_page1(keys):
        def update_pagelist(it, page_list):
            # bump the weight of an already-listed URL, otherwise append a new entry
            url = it['url']
            
            for page1 in page_list:
                if url == page1['url']:
                    page1['weight'] += 100
                    return 

            # build a fresh entry for this URL
            page = {}
            page['url'] = url
            page['weight'] = 0
            if 'weight' in it:
                page['weight'] = it['weight']
            page["userchose"] = 0
            if 'userchose' in it:
                page["userchose"] = it['userchose']

            page_list.append(page)

        # lower-case and de-duplicate the search keys before querying
        key1 = []
        for k in keys:
            k = doclex.tolower(k)
            if k not in key1:
                key1.append(k)
        keys = key1

        page_list = []
        for k in keys:
            c = collection_url_index.find({"key": k})
            for i in c:
                update_pagelist(i, page_list)

        return page_list
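
A minimal usage sketch for the function above, assuming a local MongoDB instance, that `find_page1` is callable at module level, and that `doclex.tolower` simply lower-cases a key; the database and collection names are illustrative, not taken from the example:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
collection_url_index = client['search']['url_index']   # hypothetical names

pages = find_page1(['Python', 'MongoDB'])
for page in pages:
    print page['url'], page['weight']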
Example #2
def process_url(urlinfo):
    # Fetch the page, store its profile and keyword index in MongoDB,
    # then recurse into every link discovered on the page.
    url = urlinfo['url']

    if url in processed_url_list:
        return "url is already processed"

    processed_url_list.append(url)

    print url,"process url"

    info = get_page(url)
    if info is None:
        print "url, error"
        return

    data, headers = info

    try:
        htmlp = htmlprocess(urlinfo)
        htmlp.feed(data)

        urlinfo = htmlp.urlinfo

        # headers['date'] is a byte string; detect its encoding before decoding
        encodingdate = chardet.detect(headers['date'])
        date = unicode(headers['date'], encodingdate['encoding'])

        # fall back to a generated title when the page has none
        title = urlinfo['title']
        if title == "":
            if len(urlinfo['titlegen']) > 0:
                title = urlinfo['titlegen']

        # pick the best available profile text, in order of preference
        profile = urlinfo['profile']['0']
        if profile == "":
            profile = urlinfo['profile']['1']
        if profile == "":
            if len(urlinfo['profile']['2']) > 0:
                profile = urlinfo['profile']['2'][0]

        if title != "" and profile != "":
            encodingdate = chardet.detect(title)
            title = unicode(title, encodingdate['encoding'])
            encodingdate = chardet.detect(profile)
            profile = unicode(profile, encodingdate['encoding'])
            collection_url_profile.update({'key': url},
                                          {'key': url,
                                           'urlprofile': profile.encode('utf-8'),
                                           'timetmp': time.time(),
                                           'date': date,
                                           'title': title.encode('utf-8')},
                                          True)

        # index the page under each keyword, weighted by extraction tier
        for w in ['1', '2', '3']:
            keywords = urlinfo['keys'][w]

            for key1 in keywords:
                key = doclex.tolower(key1)
                collection_url_index.update({'key': key, 'url': url},
                                            {'$set': {'key': key, 'url': url,
                                                      'timetmp': time.time(),
                                                      'weight': int(w),
                                                      'userchose': 0}},
                                            True)

        # recurse into every link found on this page
        urlinfolist = htmlp.urllist
        for key, info in urlinfolist.iteritems():
            process_url(info)
            urlinfolist[key] = {}

    except:
        import traceback
        traceback.print_exc()
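
The date-decoding step above follows chardet's usual detect-then-decode pattern; a standalone illustration of that pattern (the header value below is made up for the example):

import chardet

raw_date = 'Tue, 01 Jan 2013 00:00:00 GMT'        # byte string, as read from the HTTP headers
guess = chardet.detect(raw_date)                   # e.g. {'encoding': 'ascii', 'confidence': 1.0}
date = unicode(raw_date, guess['encoding'])        # decode with the detected encoding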
Example #3
def process_url(urlinfo):
    # Fetch the page, store its profile and keyword index in MongoDB,
    # and return the outgoing links discovered on the page.
    url = urlinfo['url']

    # strip a trailing slash so URL variants map to the same record
    if url[-1] == '/':
        url = url[:-1]
        urlinfo['url'] = url

    if url in processed_url_list:
        return "url is already processed"

    processed_url_list.append(url)

    print url,"process url"

    info = get_page(url)
    if info is None:
        print "url, error"
        return

    urllist.processurl(url)

    data, headers = info

    try:
        htmlp = htmlprocess(urlinfo)
        htmlp.feed(data)

        try:
            urlinfo = htmlp.urlinfo

            encodingdate = chardet.detect(headers['date'])
            date = unicode(headers['date'], encodingdate['encoding'])

            title = infosort.gettitle(urlinfo['title'])
            if len(title) > 16:
                title = title[0:16] + u'...'
            profile = infosort.getprofile(urlinfo['profile'])
            if title != u"" and profile != u"":
                print "update url", url
                collection_url_profile.update({'key':url} ,
                                              {'$set':{'key':url, 'urlprofile':profile.encode('utf-8', 'ignore'), 'timetmp':time.time(), 'date':date.encode('utf-8', 'ignore'), 'title':title.encode('utf-8', 'ignore')}},
                                              True)

                updatekeywords = []

                # tier-1 keywords: largest weight boost (+500)
                weight1 = []
                for key in urlinfo['keys']['1']:
                    if not doclex.inviald_key(key):
                        key = doclex.tolower(key)
                        key = key.encode('utf-8', 'ignore')
                        if key not in weight1:
                            weight1.append(key)
                for key in weight1:
                    if key not in updatekeywords:
                        updatekeywords.append(key)
                        collection_url_index.update({'key':key, 'url':url},
                                                    {'$set':{'url':url, 'key':key, 'weight':htmlp.weight+500}},
                                                    True)

                # tier-2 keywords: medium boost (+300)
                weight2 = []
                for key in urlinfo['keys']['2']:
                    if not doclex.inviald_key(key):
                        key = doclex.tolower(key)
                        key = key.encode('utf-8', 'ignore')
                        if key not in weight2:
                            weight2.append(key)
                for key in weight2:
                    if key not in updatekeywords:
                        updatekeywords.append(key)
                        collection_url_index.update({'key':key, 'url':url},
                                                    {'$set':{'url':url, 'key':key, 'weight':htmlp.weight+300}},
                                                    True)

                # tier-3 keywords: base weight only
                weight3 = []
                for key in urlinfo['keys']['3']:
                    if not doclex.inviald_key(key):
                        key = doclex.tolower(key)
                        key = key.encode('utf-8', 'ignore')
                        if key not in weight3:
                            weight3.append(key)
                for key in weight3:
                    if key not in updatekeywords:
                        updatekeywords.append(key)
                        collection_url_index.update({'key':key, 'url':url},
                                                    {'$set':{'url':url, 'key':key, 'weight':htmlp.weight}},
                                                    True)

        except:
            import traceback
            traceback.print_exc()

        # hand the discovered links back to the caller instead of recursing
        urlinfolist = htmlp.urllist
        return urlinfolist

    except:
        import traceback
        traceback.print_exc()
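
Because this version returns the discovered links rather than recursing, the caller has to drive the crawl itself. A minimal breadth-first driver sketch under that assumption (the `crawl` function and its `max_pages` limit are illustrative, not part of the original module):

def crawl(seed_urlinfo, max_pages=100):
    # breadth-first crawl driven by the urlinfolist returned by process_url
    queue = [seed_urlinfo]
    fetched = 0
    while queue and fetched < max_pages:
        urlinfo = queue.pop(0)
        result = process_url(urlinfo)
        fetched += 1
        if isinstance(result, dict):
            # each value is an urlinfo dict for a newly discovered link
            queue.extend(result.values())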
Example #4
def process_url(urlinfo):
    url = urlinfo['url']

    # strip a trailing slash so URL variants map to the same record
    if url[-1] == '/':
        url = url[:-1]
        urlinfo['url'] = url

    if url in processed_url_list:
        return "url is already processed"

    processed_url_list.append(url)

    print url, "process url"

    info = get_page(url)
    if info is None:
        print "url, error"
        return

    urllist.processurl(url)

    data, headers = info

    try:
        htmlp = htmlprocess(urlinfo)
        htmlp.feed(data)

        try:
            urlinfo = htmlp.urlinfo

            encodingdate = chardet.detect(headers['date'])
            date = unicode(headers['date'], encodingdate['encoding'])

            title = infosort.gettitle(urlinfo['title'])
            if len(title) > 16:
                title = title[0:16] + u'...'
            profile = infosort.getprofile(urlinfo['profile'])
            if title != u"" and profile != u"":
                print "update url", url
                collection_url_profile.update({'key': url}, {
                    '$set': {
                        'key': url,
                        'urlprofile': profile.encode('utf-8', 'ignore'),
                        'timetmp': time.time(),
                        'date': date.encode('utf-8', 'ignore'),
                        'title': title.encode('utf-8', 'ignore')
                    }
                }, True)

                updatekeywords = []
                weight1 = []
                for key in urlinfo['keys']['1']:
                    if not doclex.inviald_key(key):
                        key = doclex.tolower(key)
                        key = key.encode('utf-8', 'ignore')
                        if key not in weight1:
                            weight1.append(key)
                for key in weight1:
                    if key not in updatekeywords:
                        updatekeywords.append(key)
                        collection_url_index.update({
                            'key': key,
                            'url': url
                        }, {
                            '$set': {
                                'url': url,
                                'key': key,
                                'weight': htmlp.weight + 500
                            }
                        }, True)

                weight2 = []
                for key in urlinfo['keys']['2']:
                    if not doclex.inviald_key(key):
                        key = doclex.tolower(key)
                        key = key.encode('utf-8', 'ignore')
                        if key not in weight2:
                            weight2.append(key)
                for key in weight2:
                    if key not in updatekeywords:
                        updatekeywords.append(key)
                        collection_url_index.update({
                            'key': key,
                            'url': url
                        }, {
                            '$set': {
                                'url': url,
                                'key': key,
                                'weight': htmlp.weight + 300
                            }
                        }, True)

                weight3 = []
                for key in urlinfo['keys']['3']:
                    if not doclex.inviald_key(key):
                        key = doclex.tolower(key)
                        key = key.encode('utf-8', 'ignore')
                        if key not in weight3:
                            weight3.append(key)
                for key in weight3:
                    if key not in updatekeywords:
                        updatekeywords.append(key)
                        collection_url_index.update({
                            'key': key,
                            'url': url
                        }, {
                            '$set': {
                                'url': url,
                                'key': key,
                                'weight': htmlp.weight
                            }
                        }, True)

        except:
            import traceback
            traceback.print_exc()

        urlinfolist = htmlp.urllist
        return urlinfolist

    except:
        import traceback
        traceback.print_exc()
Example #5
def find_page(input, index):
    # split the query on whitespace and record each searched key
    keys = doclex.splityspace(input)
    for k in keys:
        collection_key.insert({"key": k})

    #if cache_page.has_key(input):
    #    cache_page['input']['timetmp'] = time.time()
    #    return cache_page['input']['urllist']

    # first pass: query the page index for each whitespace-separated key
    page_list = []
    for k in keys:
        k = doclex.tolower(k)
        #if key_page.has_key(k):
        #    key_page['input']['timetmp'] = time.time()
        #    page_list = key_page['input']['urllist']
        #else:
        c = collection_page.find({"key": k}).limit(10).skip(index * 10)
        for i in c:
            page = {}
            page['url'] = i['url']
            page['timetmp'] = i['timetmp']
            page['key'] = i['key']
            page_list.append(page)

        # drop results with no stored profile, attach profile data to the rest
        remove_list = []

        for url in page_list:
            c = gethtml.collection_url_profile.find({'key': url['url']})
            if c.count() <= 0:
                remove_list.append(url)
                continue
            for i in c:
                url["profile"] = i['urlprofile']
                url["date"] = i['date']
                url["title"] = i['title']

        for url in remove_list:
            page_list.remove(url)

        #key_page['input'] = {}
        #key_page['input']['timetmp'] = time.time()
        #key_page['input']['urllist'] = page_list

    # fallback: if nothing matched, re-split the query with the full lexer
    if len(page_list) == 0:
        keys = doclex.lex(input.encode('utf-8'))
        for k in keys:
            k = doclex.tolower(k)
            #if key_page.has_key(k):
            #    key_page['input']['timetmp'] = time.time()
            #    page_list = key_page['input']['urllist']
            #else:
            c = collection_page.find({"key": k}).limit(10).skip(index * 10)
            for i in c:
                page = {}
                page['url'] = i['url']
                page['timetmp'] = i['timetmp']
                page['key'] = i['key']
                page_list.append(page)

            # drop results with no stored profile, attach profile data to the rest
            remove_list = []

            for url in page_list:
                c = gethtml.collection_url_profile.find({'key': url['url']})
                if c.count() <= 0:
                    remove_list.append(url)
                    continue
                for i in c:
                    url["profile"] = i['urlprofile']
                    url["date"] = i['date']
                    url["title"] = i['title']

            for url in remove_list:
                page_list.remove(url)

            #key_page['input'] = {}
            #key_page['input']['timetmp'] = time.time()
            #key_page['input']['urllist'] = page_list

    #cache_page['input'] = {}
    #cache_page['input']['timetmp'] = time.time()
    #cache_page['input']['urllist'] = page_list

    # return at most the first ten results
    page_list = page_list[:10]

    return page_list
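
A minimal usage sketch, assuming the module-level collections (`collection_key`, `collection_page`, `gethtml.collection_url_profile`) are already connected; the query string is only an illustration:

results = find_page(u'python mongodb', 0)    # first page of results (index 0)
for page in results:
    print page['title'], page['url']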