def get_project(name):
    project_page = etree.HTML(get_page('https://www.openhub.net/p/' + name).decode('utf-8'))

    project_name = project_page.xpath(u"//*[@id=\"project_header\"]/div[1]/h1/a")[0].text
    project_tag = project_page.xpath(u"//*[@id=\"project_tags\"]/p")[0].text

    similar_projects = project_page.xpath(u"//*[@id=\"similar_projects\"]")[0].text

    manager = project_page.xpath(u"//*[@id=\"page_contents\"]/div[3]/div[2]/div/dl/dd[5]/a")[0].text

    licenses = project_page.xpath(u"//*[@id=\"page_contents\"]/div[3]/div[2]/div/dl/dd[3]")[0].text

    location_page = etree.HTML(get_page('https://www.openhub.net/p/' + name + '/enlistments').decode('utf-8'))

    location_table = location_page.xpath(u"//table//tbody")[0]
    locations = [c.getchildren()[0].text.strip() for c in location_table.getchildren()]
    code_location = '\t'.join(locations)
    project = {"update_time": datetime.datetime.now().isoformat(), "project_name": project_name, "project_tag": project_tag, "similar_projects": similar_projects, "manager": manager, "licenses": licenses, "code_location": code_location }
    for key in project:
        if project[key] is None:
            project[key] = ''
    return project


# //*[@id="analyses_language_table"]
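Every snippet in this listing calls some project-specific `get_page` helper that is not reproduced here, and its signature varies from project to project. For the OpenHub scraper above, which decodes the return value before parsing it with lxml, a minimal sketch of such a fetcher might look like the following (an assumption for illustration only, not the project's actual implementation):

# Hypothetical fetcher assumed by Example #1; the real utils.get_page is not
# included in this listing.
import requests

def get_page(url, timeout=30):
    """Fetch a URL and return the response body as raw bytes."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()   # fail loudly on HTTP error codes
    return response.content      # bytes, so callers can .decode('utf-8')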
Example #2
    def _task_list_handle(self, request, template):
        '''Common handler for listing all tasks.'''
        dicts = self._get_role(request)
        if 'is_operator' not in dicts:
            return request, get_template(template), None

        name    = request.GET.get('name')
        email   = request.GET.get('email')
        duty    = request.GET.get('duty')
        result  = request.GET.get('result')

        als = Task.objects.filter(operator = request.user, uuid__category=self.ApplicationModel.__name__)
      
        if name:
            als = als.filter(uuid__name__contains = name)
        if email:
            als = als.filter(uuid__email__contains = email)
        if duty:
            als = als.filter(operator_duty = duty)
        if result:
            als = als.filter(result = result)

        als = als.order_by('-allot_time')
        dicts['p'] = get_page(als, request )
        dicts['result_list'] = dict(Task.result_choices)
        dicts['duty_list'] = dict(HelpDeskUser.duty_choices)
        return request, get_template(template), dicts
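In this example (and several below, e.g. #6, #7 and #14), `get_page(queryset, request)` is a Django pagination helper rather than an HTTP fetcher. Its implementation is not shown in the listing; a plausible sketch consistent with how these views call it might be:

# Hypothetical pagination helper consistent with get_page(queryset, request)
# as used by the Django views above; the real implementation is not shown.
from django.core.paginator import Paginator, PageNotAnInteger

def get_page(object_list, request, per_page=20):
    """Return the Paginator page selected by request.GET['page']."""
    paginator = Paginator(object_list, per_page)
    try:
        return paginator.page(request.GET.get('page', 1))
    except PageNotAnInteger:
        return paginator.page(1)
    # An out-of-range page raises EmptyPage, which Example #7 catches itself.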
Example #3
def ajax_list(request):
    'Request and return the news list via AJAX'
    if request.session['stunum']:  # already logged in
        print(request.session['stunum'])
        news = News.objects.all()
        if 'date' in request.GET:
            date = request.GET['date']
            startDate = date.split('--')[0]
            endDate = date.split('--')[1]
            # print startDate, endDate
            news = news.filter(rel_time__gte=startDate)  # on or after the start date
            news = news.filter(rel_time__lte=endDate)  # on or before the end date
        # The len(news) > 0 checks mean that once a filter leaves no data, the remaining filters are skipped
        if len(news) > 0 and 'key_word' in request.GET:
            keyword = request.GET['key_word']
            news = news.filter(title__contains=keyword)
        if len(news) > 0 and 'department' in request.GET:
            department = request.GET['department']
            news = news.filter(section=department)
        if len(news) > 0 and 'zhuanti' in request.GET:
            zhuanti = request.GET['zhuanti']
            news = utils.handle_zhuanti(zhuanti, news)
        reDict = utils.get_page(news, 10, request.GET['page'])
        # reDict['data_list'] = json.dumps(reDict['data_list'])
        # print reDict#['data_list']
        return HttpResponse(json.dumps(reDict), content_type='application/json')
    else:
        return HttpResponse(u'error')
Example #4
def findPerson(query):
    """
    returns the name that shows up the most from the google search of the query
    arguments:
      string of the question
    return:
      name of a person
    """   
    file = open("words.txt")
    words = file.read()
    l = utils.search(query)
    goodWords=[]
    exp = "[A-Z][a-z][a-z]+ [A-Z][a-z]+"  
        
    for pages in l:
        text = re.sub("[\t\n ]", " ", utils.get_page(pages))
        result = re.findall(exp, text)
        for x in result:
            z = x.split(" ")
            if z[0].lower() not in words and z[1].lower() not in words:
                goodWords.append(x)
    wordcounts={}
    for word in goodWords:
        if word in wordcounts:
            wordcounts[word] += 1
        else:
            wordcounts[word] = 1
    person = list(wordcounts.keys())[0]
    for word in wordcounts:
        if wordcounts[word] > wordcounts[person]:
            person = word
    return person
Example #5
    def POST(self, favorite_id):
        """
        save changes.
        """

        post_sent = web.input()
        # post_sent['date'] = datetime.datetime.now()  # uncomment if you want to update the date!
        
        old_favorite = db.load_fav_by_id(favorite_id) # load it again
        
        flag_get_title = False
        flag_save_page = False
        
        # update post_sent with old page_path
        # if save_page is True it will be overwritten to new page_path
        # otherwise old value used.
        post_sent['page_path'] = old_favorite['page_path']
        
        # checkboxes
        if 'get_title' in post_sent:
            flag_get_title = True
        if 'save_page' in post_sent:
            db.delete_saved_page(favorite_id) # remove previous page
            flag_save_page = True
        # if any of two flags is True -> call utils.get_page
        if flag_get_title or flag_save_page:
            post_sent = utils.get_page(post_sent, flag_save_page, flag_get_title)

        db.update_favorite(favorite_id, post_sent) # update
        raise web.seeother('/')# go home
Example #6
    def history(self, request, template):
        '''Application history for the requesting user.'''
        dicts = self._get_role(request)
        if 'is_applicant' not in dicts:
            return HttpResponseRedirect(self._get_login_url(request))

        name    = request.GET.get('name')
        email   = request.GET.get('email')
        status  = request.GET.get('status')
        page    = request.GET.get('page')
        user    = HelpDeskUser.objects.filter(user = request.user, role='0')
        user    = user[0].user
        # dep     = [u.department for u in user]
        # apps    = self.ApplicationModel.objects.filter(department__in = dep).order_by('-apply_time')

        # only login user 
        apps = self.ApplicationModel.objects.filter(submit_user = user).order_by('-apply_time')

        if name:
            apps = apps.filter(name__contains = name)
        if email:
            apps = apps.filter(email__contains = email)
        if status:
            apps = apps.filter(status = status)

        p = get_page(apps, request)
        
        dicts['p'] = p
        dicts['statuslist'] = dict(self.ApplicationModel.status_choices)
        return render_to_response(request, get_template(template), dicts)
Example #7
    def list(self, request, template):
        """
            Extends the base list view with file upload and XLS import, plus a hook for processing data after pagination.
        """
        u = request.user
        ls = self._get_list(request)
       
        # If an overridden _get_list returns None this would raise, so coerce it to an empty queryset
        if ls is None:
            ls = self.DefaultModel.objects.none()

        args = {}
        for ak in self.list_args.keys():
            if re.search('_doption$', ak):
                if request.GET.get(ak , None):
                    datestr = (request.GET.get(ak, None)).split('-')
                    args[str(self.list_args.get(ak))] = datetime.strptime((''.join((datestr[0],'-',datestr[1],'-01'))), '%Y-%m-%d')
            elif re.search('_option$', self.list_args.get(ak)):
                if request.GET.get(ak, None) and request.GET.get(ak + '_option', None):
                    args[str(ak+'__'+request.GET.get(ak + '_option', None))] = str(request.GET.get(ak, None))
#            elif re.search('_extra$', self.list_args.get(ak)):
#                if request.GET.get(ak, None):
#                    ls = self._extra_filter(request, ls, ak,self.list_args[ak])
            else:
                if request.GET.get(ak, None):
                    try:
                        args[str(self.list_args.get(ak))] = str(request.GET.get(ak, None))
                    except UnicodeEncodeError:
                        args[str(self.list_args.get(ak))] = request.GET.get(ak, None)
                
        ls = ls.filter(**args)
        ls = self._extra_filter(request,ls)
        
        
        if(request.GET.get('excel')):
            if request.method == "POST":
                cols = request.POST.getlist("cols")
                return self.csv_export(request, ls, cols)
            
        try:
            p = get_page(ls, request)
        except EmptyPage:
            return HttpResponseRedirect('./')

        c_list = []
        if self.csv_columns:
            for c in self.csv_columns:
                c_list.append(c[0].decode("utf-8"));
                
        p = self._deal_page_data(request,p)
        list_dicts = {'p':p, 'excel_cs':c_list}
        list_dicts.update(self._get_list_dicts(request))
        
        if(request.GET.get('upload')):
            if request.method == "POST":
                return self.upload(request, template, list_dicts)
            
        
        return render_to_response(request, template, list_dicts )   
def download_episode(number):
    try:
        download_episode(get_episodes()[(number.real)-1])
    except AttributeError: #no .real => is _not_ an integer
        page = utils.get_page("http://musicforprogramming.net/?{0}".format("c="+number if not number.startswith("c") else number))
        url, songname = re.findall(r"(http:\/\/datashat\.net\/(music_for_programming_.+\.mp3))\"", page)[0]
        print(url, songname)
        utils.store(url, songname, overwrite=False)
def imguralbum(url, opt_store=True):
    html = utils.get_page(url)
    names = []
    for s in re.findall(r"<a.+?class=\"zoom\".+?href=\"(.+?)\">", html):
        r = re.search(r"([^/]+?)(.png|.jpg|.jpeg)$", s)
        if opt_store: utils.store("https:" + s, r.group(1) + r.group(2))
        names.append(r.group(1) + r.group(2))
    return names
Example #10
def last_comic(download=True, return_number=False):
    text = utils.get_page('http://megatokyo.com')
    #print (strip_image(text))
    strip_number = int(strip_image(text)[0][:-4]) #removing .png || .gif
    if download:
        dump_single(strip_number, image_format=strip_image(text)[0][-4:])
    if return_number:
        return strip_number
Example #11
def dump_single(number, image_format=None):
    if not image_format:
        text = utils.get_page('http://megatokyo.com/strip/{0}'.format(number)) #retrieving the image format
        #print (strip_image(text))
        strip_name = strip_image(text)[0]
    else:
        strip_name = str(number)+image_format
    utils.store('http://megatokyo.com/strips/{0}'.format(strip_name), strip_name, overwrite=False)
Example #12
def parse_category_by_type(category, subcategory, link, project_index, type = 'popular'):
    if subcategory:
        message = 'Parse subcategory "{0}" of "{1}"'.format(subcategory, category)
    else:
        message = 'Parse category "{0}"'.format(category)

    print(message)

    projects = []
    stop = False
    page_count = 1
    while not stop and (page_count <= MAX_PAGE_PARSE or MAX_PAGE_PARSE < 0):
        page = get_page('{0}{2}/?page={1}'.format(link, page_count, type))
        page_count+=1
        project_blocks = page.cssselect('.project')
        stop = len(project_blocks) == 0
        for block in project_blocks:
            try:
                location = block.cssselect('.location-name')[0].text.strip()
            except Exception:
                location = ''

            project = {
                'category': category,
                'subcategory': subcategory,
                'name': block.cssselect('.project-card > h2 > strong > a')[0].text.strip(),
                'description': block.cssselect('.project-card > p')[0].text.strip(),
                'location': location,
                'founder': block.cssselect('.project-card > h2 > span')[0].text.strip()[3:],
                'funded': None,
                'funded_date': None,
                'pledged': None,
                'days left': None,
                }
            stats = block.cssselect('.project-stats > li')
            for stat in stats:
                stat_name = ''.join(stat.xpath("text()")).strip()
                if stat_name in {'funded', 'pledged'}:
                    value = stat.cssselect('strong')[0].text.replace('%', '').replace('$', '').replace(',', '').strip()
                    project[stat_name] = float(value)
                elif stat_name == 'days left':
                    value = stat.cssselect('.num')[0].text.strip()
                    project[stat_name] = int(value)
                elif stat_name in ['hours left', 'hour left', 'min left', 'mins left']:
                    project['days left'] = 0
                else:
                    value = stat_name
                    project['days left'] = -1
                    project['funded_date'] = str(datetime.datetime.strptime(value, '%b %d, %Y'))

            h = hashit(project)

            if h not in project_index:
                project_index.add(h)
                projects.append(project)

    print('{0}. Ended!!'.format(message))
    return projects
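The `hashit` call above is used only to de-duplicate projects across pages; its implementation is not part of this listing. A minimal stand-in (an assumption, not the original helper) could hash a canonical serialization of the project dict:

# Hypothetical stand-in for the hashit helper used above for de-duplication.
import hashlib
import json

def hashit(project):
    """Return a stable digest of a project dict so repeated entries can be skipped."""
    canonical = json.dumps(project, sort_keys=True, default=str)
    return hashlib.md5(canonical.encode('utf-8')).hexdigest()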
def get_project_news(name):
    project_page = etree.HTML(get_page('https://www.openhub.net/p/' + name).decode('utf-8'))

    project_age = project_page.xpath(u"//*[@id=\"factoids\"]/li[3]/div/span[1]/a")[0].text.strip()

    team_size = project_page.xpath(u"//*[@id=\"factoids\"]/li[1]/div/a[2]")[0].text.strip()

    project_activity = project_page.xpath(u"//*[@id=\"project_header_activity_indicator\"]/div")[0].text.strip()

    factoids_page = etree.HTML(get_page('https://www.openhub.net/p/' + name + '/factoids').decode('utf-8'))
    comments = ''.join(factoids_page.xpath(u"//*[@id=\"page_contents\"]")[0].itertext()).replace(u'\xa0', '').strip()

    # team_size_per_month = project_page.xpath(u"//*[@id=\"factoids\"]/li[3]/div/span[2]/a")[0].text
    # print(team_size_per_month)

    # contributor = project_page.xpath(u"")[0].text
    # print(contributor)

    ratings_page = etree.HTML(get_page('https://www.openhub.net/p/' + name + '/reviews/summary').decode('utf-8'))
    community_score = ratings_page.xpath(u"//*[@id=\"average_rating_details_2\"]")[0].text.replace(u'\xa0', '').strip()

    cost_page = etree.HTML(get_page('https://www.openhub.net/p/' + name + '/estimated_cost').decode('utf-8'))
    costs =  [''.join(i.itertext()).strip().replace(',', '').split('\n') for i in cost_page.xpath('.//div[@class="controls"]')][1:]
    lines =  [i.attrib['value'] for i in cost_page.xpath('.//option')]
    codebase_size = int(costs[0][0])
    estimated_effort = int(costs[1][0])
    estimated_cost = int(costs[2][1])

    cocomo = { 'codebase_size': codebase_size, 'estimated_effort': estimated_effort, 'estimated_cost': estimated_cost, "all_code": lines[0], 'logic_code_only': lines[1], 'markup_only': lines[2], 'build_scripts_only': lines[3] }


    language_page = etree.HTML(get_page('https://www.openhub.net/p/' + name + '/analyses/latest/languages_summary').decode('utf-8'))

    languages_table = language_page.xpath(u"//*[@id=\"analyses_language_table\"]")[0]
    data = [x for c in languages_table.getchildren() for x in c.getchildren()][2:-2]
    data = [[''.join(j.itertext()).strip() for j in i.getchildren()][1:] for i in data]

    languages = [{"code_name": line[0], "code_lines": line[1], "comment_lines": line[2], "comment_ratio": line[3], "blank_lines" : line[4], "total_lines": line[5], "total_percentage" : line[6]} for line in data]


    project_news = {"update_time": datetime.datetime.now().isoformat(), 'team_size': team_size, 'project_age': project_age, 'activity': project_activity, 'comments': comments, 'languages': json.dumps(languages), 'cost': json.dumps(cocomo) }
    for key in project_news:
        if project_news[key] is None:
            project_news[key] = ''
    return project_news
Example #14
    def _task_handle(self, request, template):
        '''Common handler for the list of pending tasks.'''
        dicts = self._get_role(request)
        if 'is_operator' not in dicts:
            return request, get_template(template), None

        al = Task.objects.filter(operator = request.user, result='0', uuid__category=self.ApplicationModel.__name__)
        dicts['p'] = get_page(al, request)
        return request, get_template(template), dicts
Example #15
def search(request):
    from sphinxapi import SphinxClient, SPH_MATCH_EXTENDED, SPH_SORT_RELEVANCE
    term = request.GET.get('term', '')
    category = None
    args = [u'term=%s'%term]
    template_name = 'board/search.html'
    if term:
        sphinx = SphinxClient()
        sphinx.SetServer(settings.SPHINX_SERVER, settings.SPHINX_PORT)
        sphinx.SetMatchMode(SPH_MATCH_EXTENDED)
        sphinx.SetSortMode(SPH_SORT_RELEVANCE)
        cid = request.GET.get('c')
        if cid:
            try:
                cid = int(cid)
            except (TypeError, ValueError):
                raise Http404
            category = get_object_or_404(Category, pk=cid)
            if category:
                sphinx.SetFilter('category_id', [category.id])
                args.append(u'c=%s'%cid)
        user_settings = get_user_settings(request.user)
        try:
            page = int(request.GET.get('page', '1'))
            if page < 1:
                raise Http404
        except ValueError:
            raise Http404
        #sphinx.SetLimits(page * user_settings.ppp, user_settings.ppp)
        if request.GET.get('adv_submit.x'):
            template_name='board/advanced_search.html'
            u = User.objects.filter(username=term)
            if u:
                q = QuerySetPaginator(Post.objects.filter(user=u),
                    user_settings.ppp)
            else:
                q = Paginator([], 1).page(1)
        else:
            result = sphinx.Query(u'@@relaxed %s'%term)
            if 'total_found' not in result:
                template_name = 'board/search_unavailable.html'
            pages = result.get('total_found', 0) // user_settings.ppp
            if pages > 0 and page > pages:
                raise Http404
            ids = [m['id'] for m in result.get('matches', [])]
            q = QuerySetPaginator(Post.view_manager.filter(id__in=ids),
                user_settings.ppp)
            q = get_page(request.GET.get('page', 1), q)
    else:
        q = Paginator([], 1).page(1)
    return render_to_response(template_name, {
        'result': q,
        'term': term,
        'category': category,
        'args': u'&'.join(['']+args),
    }, context_instance=RequestContext(request, processors=extra_processors))
Example #16
def list_by_tag(request, name):
	'''Posts filtered by tag.'''
	try:
		page = int(request.GET.get('page', '1'))
	except ValueError:
		page = 1

	return render_to_response('index.html', {
		'title': name + ' - Tag',
		'posts': utils.get_page(Post.objects.filter(tags__name=name), page),
	}, context_instance=RequestContext(request))
Example #17
 def crawl_daili66(self, page_count=4):
     start_url = 'http://www.66ip.cn/{0}.html'
     urls = [start_url.format(page) for page in range(1, page_count + 1)]
     for url in urls:
         html = get_page(url)
         if html:
             selector = etree.HTML(html)
             trs = selector.xpath('.//div[@id="main"]/div[1]/div[1]/table/tr')
             for i in range(1, len(trs)):
                 ip = trs[i].xpath('.//td[1]/text()')[0]
                 port = trs[i].xpath('.//td[2]/text()')[0]
                 yield ':'.join([ip, port])
Example #18
def home(request):
    """首页"""
    try:
        page = int(request.GET.get("page", "1"))
    except ValueError:
        page = 1

    return render_to_response(
        "index.html",
        {"index": True, "keywords": settings.SITE_DESC, "posts": utils.get_page(Post.objects.all(), page)},
        context_instance=RequestContext(request),
    )
Example #19
    def list(self, request, template):
        """
        Implements listing and querying of objects; subclasses normally do not need to override this method.
        """
        u = request.user
        ls = self._get_list(request)
        
        args = {}
        for ak in self.list_args.keys():
            if re.search('_doption$', ak):
                if request.GET.get(ak , None):
                    datestr = (request.GET.get(ak, None)).split('-')
                    args[str(self.list_args.get(ak))] = datetime.strptime((''.join((datestr[0],'-',datestr[1],'-01'))), '%Y-%m-%d')
            elif re.search('_option$', self.list_args.get(ak)):
                if request.GET.get(ak, None) and request.GET.get(ak + '_option', None):
                    args[str(ak+'__'+request.GET.get(ak + '_option', None))] = str(request.GET.get(ak, None))
            else:
                if request.GET.get(ak, None):
                    try:
                        args[str(self.list_args.get(ak))] = str(request.GET.get(ak, None))
                    except UnicodeEncodeError:
                        args[str(self.list_args.get(ak))] = request.GET.get(ak, None)
                     
        pri_contains = request.GET.get("pcontains", None)
        if pri_contains:
            ls = ls.model.objects_uc.filter_by_username_contains_pri(pri_contains, ls)
           
        pri = request.GET.get("p", None)
        if pri:    
            order = 0
            ls = ls.model.objects_uc.filter_by_jobnum_str(pri, ls, order)
            
        attstr = request.GET.get("a",None)
        if attstr: 
            order = 1           
            ls = ls.model.objects_uc.filter_by_jobnum_str(attstr, ls, order)
        print(args)
        ls = ls.filter(**args)
                
        if(request.GET.get('excel')):
            if request.method == "POST":
                cols = request.POST.getlist("cols")
                return self.csv_export(request, ls, cols)

        p = get_page(ls, request)
        c_list = []
        if self.csv_columns:
            for c in self.csv_columns:
                c_list.append(c[0].decode("utf-8"));
        list_dicts = {'p':p, 'excel_cs':c_list}
        list_dicts.update(self._get_list_dicts(request))
        
        return render_to_response(request, template, list_dicts )   
Example #20
    def get(self, PAGE_RE):
        username = self.get_username()
        login = True if username else False

        if login:
            v = self.request.get('v')
            if v and v.isdigit():
                p = utils.get_page(PAGE_RE, page_id=int(v))
            else:
                p = utils.get_page(PAGE_RE)

            if p is None:
                self.error(404)
            else:
                self.render('/templates/editpage.html',
                            login=login,
                            username=username,
                            page=p)

        else:
            self.redirect('/login')
Example #21
def home(request):
	'''Home page.'''
	try:
		page = int(request.GET.get('page', '1'))
	except ValueError:
		page = 1

	return render_to_response('index.html', {
		'index': True,
		'keywords': settings.SITE_DESC,
		'posts': utils.get_page(Post.objects.all(), page),
	}, context_instance=RequestContext(request))
Example #22
    def _allot_handle(self, request, template):
        '''Common handler for the list of applications awaiting assignment.'''
        dicts = self._get_role(request)
        if 'is_alloter' not in dicts:
            return request, get_template(template), None

        apps        = self.ApplicationModel.objects.filter(status__in=('0', '5')).order_by('-apply_time')
        undocount   = apps.count()
        
        dicts['p']          = get_page(apps, request)
        dicts['undocount']  = undocount
        return request, get_template(template), dicts
Example #23
def thread(request, thread_id):
    try:
        thr = Thread.view_manager.get(pk=thread_id)
    except Thread.DoesNotExist:
        raise Http404

    if not thr.category.can_read(request.user):
        raise PermissionError

    render_dict = {}

    if request.user.is_authenticated():
        render_dict.update({"watched": WatchList.objects.filter(user=request.user, thread=thr).count() != 0})

    if request.POST:
        if not thr.category.can_post(request.user):
            raise PermissionError
        postform = PostForm(request.POST)
        if postform.is_valid():
            postobj = Post(thread = thr,
                    user = request.user,
                    text = postform.cleaned_data['post'],
                    )
            postobj.save() # this needs to happen before many-to-many private is assigned

            if len(postform.cleaned_data['private']) > 0:
                _log.debug('thread(): new post private = %s' % postform.cleaned_data['private'])
                postobj.private = postform.cleaned_data['private']
                postobj.is_private = True
                postobj.save()
            postobj.notify()
            return HttpResponseRedirect(reverse('board_locate_post',
                args=(postobj.id,)))
    else:
        postform = PostForm()

    # this must come after the post so new messages show up
    post_list = Post.view_manager.posts_for_thread(thread_id, request.user)
    user_settings = get_user_settings(request.user)
    if user_settings.reverse_posts:
        post_list = post_list.order_by('-odate')
    post_list = QuerySetPaginator(post_list, user_settings.ppp)

    render_dict.update({
            'result': get_page(request.GET.get('page', 1), post_list),
            'thr': thr,
            'postform': postform,
            'category': thr.category,
            })
    
    return render_to_response('board/thread.html',
            render_dict,
            context_instance=RequestContext(request, processors=extra_processors))
Example #24
 def crawl_xicidaili(self, page_count=4):
     start_url = 'http://www.xicidaili.com/nn/{0}'
     urls = [start_url.format(page) for page in range(1, page_count + 1)]
     for url in urls:
         html = get_page(url)
         if html:
             selector = etree.HTML(html)
             trs = selector.xpath('.//table[@id="ip_list"]/tr')
             for i in range(1, len(trs)):
                 ip = trs[i].xpath('.//td[2]/text()')[0]
                 port = trs[i].xpath('.//td[3]/text()')[0]
                 yield ':'.join([ip, port])
Example #25
def get_entry(id):
    
    contents = utils.get_page(show_entry_base_url%id)
    
    soup = BeautifulSoup(contents)
    
    
    for th in soup.find_all('th'):
        if th.has_attr('id') and th['id'] == "modelname":
            entry = Entry(id, th.string)
    
    return entry
Example #26
 def crawl_ip3366(self, page_count=4):
     start_url = 'http://www.ip3366.net/free/?stype=1&page={0}'
     urls = [start_url.format(page) for page in range(1, page_count + 1)]
     for url in urls:
         html = get_page(url)
         if html:
             selector = etree.HTML(html)
             table = selector.xpath('.//div[@id="list"]/table')[0]
             trs = table.xpath('.//tr')
             for i in range(1, len(trs)):
                 ip = trs[i].xpath('.//td[1]/text()')[0]
                 port = trs[i].xpath('.//td[2]/text()')[0]
                 yield ':'.join([ip, port])
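The crawl_* methods in Examples #17, #24 and #26 are generators that yield 'ip:port' strings one at a time. A caller typically just iterates them and collects the results, for example (Crawler below is a hypothetical class standing in for whatever class defines these methods):

# Illustrative usage only; Crawler is a placeholder for the class that holds
# the crawl_* generator methods shown above.
crawler = Crawler()
for proxy in crawler.crawl_ip3366(page_count=2):
    print(proxy)  # e.g. '1.2.3.4:8080'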
Example #27
def index(request, page=1):
    '''Home page, with the page number taken from the URL.'''
    # try:
    # page = int(request.GET.get('page', '1'))
    # except ValueError:
    #     page = 1
    page = int(page)

    return render_to_response('index.html', {
        'index': True,
        'keywords': settings.SITE_DESC,
        'posts': utils.get_page(Post.objects.all(), page),
    }, context_instance=RequestContext(request))
Example #28
 def show_verify(self, request, template):
     if get_role(request) == 'admin' or get_role(request) == 'gra':
         list = self.DefaultModel.objects.filter(activate_state = '1').values("verify").annotate(count=Count('verify')).order_by("-count")
     else:
         c = get_perm_college_sch(request)
         if c:
             list = self.DefaultModel.objects.filter(activate_state = '1', eduinfo__apply_major__school_code = c.code).values("verify").annotate(count=Count('verify')).order_by("-count")
         else:
             list = None
     p = get_page(list, request)
     dicts = get_top_left_modules(request,5)
     dicts.update({"p": p})        
     return render_to_response(request, template, dicts)
Example #29
    def get(self, PAGE_RE):
        v = self.request.get('v')
        if v and v.isdigit():
            p = utils.get_page(PAGE_RE, page_id=int(v))
        else:
            p = utils.get_page(PAGE_RE)

        username = self.get_username()
        login = True if username else False

        if p is None:
            if login:
                content = ''    # empty content
                p = utils.create_page(PAGE_RE, content)
                self.redirect('/_edit' + PAGE_RE)
            else:
                self.redirect('/login')
        else:
            self.render('/templates/wikipage.html',
                        login=login,
                        username=username,
                        page=p)
Example #30
def parse_navigation():
    start_page = get_page(to_absolute_url('/discover/'))
    nav = dict()
    for el in start_page.cssselect('.navigation > li > a'): # get first level categories
        if el.text in EXCLUDE_NAVIGATION:
            continue

        link = to_absolute_url(el.attrib['href'])
        nav[el.text] = {
            'link': link,
            'children': dict(),
        }
        category_page = get_page(link)
        for sub_el in category_page.cssselect('.subnavigation > li > a'): # look for second level categories
            if sub_el.text in EXCLUDE_NAVIGATION:
                continue

            sub_category_link = to_absolute_url(sub_el.attrib['href'])
            nav[el.text]['children'][sub_el.text] = {
                'link': sub_category_link,
            }
    return nav
Example #31
 def crawl_ip3366(self):
     for i in range(1, 4):
         start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
         html = get_page(start_url)
         if html:
             find_tr = re.compile('<tr>(.*?)</tr>', re.S)
             trs = find_tr.findall(html)
             for s in range(1, len(trs)):
                 find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                 re_ip_address = find_ip.findall(trs[s])
                 find_port = re.compile('<td>(\d+)</td>')
                 re_port = find_port.findall(trs[s])
                 for address, port in zip(re_ip_address, re_port):
                     address_port = address + ':' + port
                     yield address_port.replace(' ', '')
Example #32
 def crawl_iphai(self):
     start_url = 'http://www.iphai.com/'
     html = get_page(start_url)
     if html:
         find_tr = re.compile('<tr>(.*?)</tr>', re.S)
         trs = find_tr.findall(html)
         for s in range(1, len(trs)):
             find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>',
                                  re.S)
             re_ip_address = find_ip.findall(trs[s])
             find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
             re_port = find_port.findall(trs[s])
             for address, port in zip(re_ip_address, re_port):
                 address_port = address + ':' + port
                 yield address_port.replace(' ', '')
Example #33
    def crawl_daili66(self, page_count=4):
        start_url = "http://www.66ip.cn/{}.html"
        urls = [start_url.format(page) for page in range(1, page_count + 1)]

        for url in urls:
            html = get_page(url)

            if html:
                doc = pq(html)
                trs = doc(".containerbox table tr:gt(0)").items()

                for tr in trs:
                    ip = tr.find("td:nth-child(1)").text()
                    port = tr.find("td:nth-child(2)").text()
                    yield ":".join([ip, port])
Example #34
 def menu(self):
     for i in range(1, 2):
         target_url = 'http://www.ygdy8.net/html/gndy/oumei/list_7_{}.html'.format(
             i)
         # Build an XPath tree from the page for filtering
         selector = build_xpath_tree(get_page(target_url, 'gb2312'))
         # Grab the second <a> tag in each row, which links to the detail page
         items = selector.xpath('//div[@class="co_content8"]//tr//a[2]')
         # Extract the URL and the movie title from each <a> tag
         for item in items:
             next_url = item.xpath('@href')[0]
             name = re.findall(u'《(.*)》', item.xpath('text()')[0])[0]
             print(name)
             self.get_info(next_url,
                           movie_info={'name': windows_name_format(name)})
Example #35
 def crawl_proxy360(self):
     """
     Fetch proxies from Proxy360
     :return: proxy
     """
     start_url = 'http://www.proxy360.cn/Region/China'
     print('Crawling', start_url)
     html = get_page(start_url)
     if html:
         doc = pq(html)
         lines = doc('div[name="list_proxy_ip"]').items()
         for line in lines:
             ip = line.find('.tbBottomLine:nth-child(1)').text()
             port = line.find('.tbBottomLine:nth-child(2)').text()
             yield ':'.join([ip, port])
Example #36
def resource_manage():
    query_data = request.args

    page = int(query_data.get('page', 1))
    size = int(query_data.get('size', current_app.config['ARTICLE_PER_PAGE']))

    if page <= 0 or size <= 0:
        abort(400)

    offset, limit = get_page(page, size)
    session = getSessionFactory().get_session()
    resources = session.query(Resources) \
                .order_by(Resources.id.desc()).offset(offset).limit(limit).all()
    resources_data = [resource.get_map_data() for resource in resources]
    return render_template('resourceUpload.html', resources = resources_data)
Example #37
def scraper_main_gumtree(url):
    """ Reads pages with offers from GumTree and provides URLS to said offers. """

    # Loading the page
    page = get_page(url)

    # Putting the offer's URLs together
    offers = page.element("div[class='view'] div[class='title'] a")

    return {
        "url":
        url,
        "offers_urls":
        ["https://www.gumtree.pl" + off.attrib.get("href") for off in offers]
    }
Example #38
    def crawl_kuaidaili(self, page_count=4):
        start_url = "http://www.kuaidaili.com/free/inha/{}"
        urls = [start_url.format(page) for page in range(1, page_count + 1)]

        for url in urls:
            html = get_page(url)

            if html:
                doc = pq(html)
                trs = doc("#list tbody tr").items()

                for tr in trs:
                    ip = tr.find("td:nth-child(1)").text()
                    port = tr.find("td:nth-child(2)").text()
                    yield ":".join([ip, port])
Example #39
    def crawl_xicidaili(self, page_count=4):
        start_url = "http://www.xicidaili.com/nn/{}"
        urls = [start_url.format(page) for page in range(1, page_count + 1)]

        for url in urls:
            html = get_page(url)

            if html:
                doc = pq(html)
                trs = doc("#ip_list tr:gt(0)").items()

                for tr in trs:
                    ip = tr.find("td:nth-child(2)").text()
                    port = tr.find("td:nth-child(3)").text()
                    yield ":".join([ip, port])
Example #40
    def _get_quiz_json(self, quiz_id, session_id):
        headers = self._auth_headers_with_json()
        data = {"contentRequestBody": {"argument": []}}

        reply = get_page(self._session,
                         POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE,
                         json=True,
                         post=True,
                         data=json.dumps(data),
                         headers=headers,
                         user_id=self._user_id,
                         class_name=self._course_name,
                         quiz_id=quiz_id,
                         session_id=session_id)
        return reply['contentResponseBody']['return']
Example #41
def scrape():
    for url in remaining_urls():
        try:
            id = get_page_identifier(url)
            page = get_page(url)
            metadata_to_save = capture_metadata(page)
            save_captured_data(metadata_to_save, id)
            print(
                'Saved volume {}, issue {}, page {} - {:.2f}% finished'.format(
                    id['volume'], id['issue'], id['page'], 100 *
                    len(os.listdir('Metadata')) / len(os.listdir('PNGs'))))
        except Exception as error:
            print('Failure at volume {}, issue {}, page {}: {}'.format(
                id['volume'], id['issue'], id['page'], error))
            print('Skipping for now')
Example #42
 def crawl_goubanjia(self):
     start_url = 'http://www.goubanjia.com/free/gngn/index{page}.shtml'
     for page in range(1, 11):
         html = get_page(start_url.format(page=page))
         if html:
             doc = pq(html.text)
             tds = doc('td.ip').items()
             for td in tds:
                 td.find('p').remove()
                 port_code = td.find('.port').attr['class'].replace(
                     'port ', '')
                 port = proxy_port(port_code)
                 td.find('.port').remove()
                 ip = td.text().replace(' ', '')
                 proxy = ip + port
                 yield proxy
Example #43
def show_tag(request, tagname):
    '''View posts filtered by tag.'''
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    c = {
        'settings': global_settings,
        'title': tagname + ' - Tag',
        'no_sidebar': False,
        'posts': utils.get_page(Post.objects.filter(tags__name=tagname), page),
    }
    c.update(common_response(request))
    return render_to_response('%s/index.html' % THEME,
                              c,
                              context_instance=RequestContext(request))
Example #44
    def get_danmu(cls, cid):
        """获取视频对应的弹幕信息"""
        danmu_url = 'https://api.bilibili.com/x/v1/dm/list.so?oid={}'.format(
            cid)

        res = get_page(danmu_url)
        danmu = []
        try:
            response = res.content.decode('utf-8')
            selector = Selector(text=response)
            for d in selector.css('d'):
                txt = d.css('::text').extract_first()
                danmu.append({'txt': txt})
        except Exception as e:
            print(e)
        return danmu
Example #45
 def crawl_daili66(self, page_count=5):
     """
     Proxy source: 66ip
     :param page_count: number of pages to crawl
     :return: proxy
     """
     start_url = 'http://www.66ip.cn/{}.html'
     urls = [start_url.format(page) for page in range(1, page_count + 1)]
     for url in urls:
         r = get_page(url)
         html = etree.HTML(r)
         proxy_trs = html.xpath("//div[@id='main']//table//tr")
         for i in range(1, len(proxy_trs)):  # the first row is the table header, skip it
             ip = proxy_trs[i].xpath("./td[1]")[0].text
             port = proxy_trs[i].xpath("./td[2]")[0].text
             yield ":".join([ip, port])
Example #46
	def IP3366Crawler(self):
		"""
		Cloud proxy (www.ip3366.net)
		parse html file to get proxies
		:return:
		"""
		start_url = 'http://www.ip3366.net/free/?stype=1&page={page}'
		urls = [start_url.format(page=i) for i in range(1, 8)]
		# \s* matches whitespace, letting the pattern span line breaks
		ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
		for url in urls:
			html = get_page(url)
			re_ip_address = ip_address.findall(str(html))
			for adress, port in re_ip_address:
				result = f"{adress}:{port}"
				yield result.strip()
Example #47
 def crawl_goubanjia(self):
     """
     Fetch proxies from Goubanjia
     :return: proxy
     """
     start_url = 'http://www.goubanjia.com/'
     html = get_page(start_url)
     # print(html)
     if html:
         doc = pq(html)
         tds = doc('.table.table-hover tbody tr').items()
         for td in tds:
             td.find('p').remove()  # drop the <p> node, which duplicates the content
             ip_class = td.find('.ip')
             ip = ip_class.text().replace('\n', '')
             yield ip
Example #48
def summary(request, *args, **kwargs):
    if not request.session.get(
            'is_login', False) or request.session['user_type'] != 'tender':
        return redirect("/index")
    error_msg = ''
    page = utils.get_page(args)  # get the current page number
    _projects_dict = utils.get_all_projects()
    page_list = utils.get_pages(page, _projects_dict)
    projects_dict, page, project_id = utils.adjust_info(
        _projects_dict, page, 1)
    if request.method == 'GET':
        return render(request, 'tender_projectsum.html', {
            'projects_dict': projects_dict,
            "page_list": page_list,
            "page": page
        })
Example #49
 def craw_iphai(self):
     start_url = 'http://www.iphai.com/'
     html = get_page(start_url)
     if html:
         # Not sure why the original makes this so complicated; a direct regex match would do
         find_tr = re.compile('<tr>(.*?)</tr>', re.S)
         trs = find_tr.findall(html)
         for s in range(1, len(trs)):  # the site has a single page; loop over each matched row
             find_ip = re.compile('<td>\s*?(\d+\.\d+\.\d+\.\d+)\s*?</td>',
                                  re.S)
             re_ip_address = find_ip.findall(trs[s])
             find_port = re.compile('<td>\s*?(\d+)\s*?</td>', re.S)
             re_port = find_port.findall(trs[s])
             for address, port in zip(re_ip_address, re_port):
                 address_port = address + ':' + port
                 yield address_port.replace(' ', '')
Example #50
	def crawl_data5u(self):
		"""
		Data5U ("worry-free") proxy
		:return:
		"""
		start_url = 'http://www.data5u.com'
		html = get_page(start_url)
		# print(html)
		ip_adress = re.compile(
			'<ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;"><li class=".*">(.*?)</li></span>'
		)
		# \s* matches whitespace, letting the pattern span line breaks
		re_ip_adress = ip_adress.findall(str(html))
		for adress, port in re_ip_adress:
			result = f"{adress}:{port}"
			yield result.strip()
Example #51
 def crawl_daili66(self, page_count=10):
     """获取66代理
     :param page_count: 页码
     :return 代理"""
     start_url = 'http://www.66ip.cn/{}.html'
     urls = [start_url.format(page) for page in range(1, page_count+1)]
     for url in urls:
         print('Crawling', url)
         html = get_page(url)
         if html:
             doc = pq(html)
             trs = doc('.containerbox table tr:gt(0)').items()
             for tr in trs:
                 ip = tr.find('td:nth-child(1)').text()
                 port = tr.find('td:nth-child(2)').text()
                 yield ':'.join([ip, port])
Example #52
 def crawl_xicidaili(self):
     for i in range(1, 3):
         start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
         print('Crawling', start_url)
         html = get_page(start_url)
         if html:
             find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
             trs = find_trs.findall(html)
             for tr in trs:
                 find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                 re_ip_address = find_ip.findall(tr)
                 find_port = re.compile('<td>(\d+)</td>')
                 re_port = find_port.findall(tr)
                 for address, port in zip(re_ip_address, re_port):
                     address_port = address + ':' + port
                     yield address_port.replace(' ', '')
Example #53
 def crawl_kuaidaili(self):
     '''
     Fetch proxies from Kuaidaili
     :return:
     '''
     for i in range(1, 6):
         start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
         html = get_page(start_url)
         if html:
             ip_address = re.compile('<td data-title="IP">(.*?)</td>')
             re_ip_address = ip_address.findall(html)
             port = re.compile('<td data-title="PORT">(.*?)</td>')
             re_port = port.findall(html)
             for address, port in zip(re_ip_address, re_port):
                 address_port = address + ':' + port
                 yield address_port.replace(' ', '')
Example #54
 def crawl_ip3366(self):
     '''
     Fetch proxies from ip3366
     :return:
     '''
     for page in range(1, 7):
         start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(
             page)
         html = get_page(start_url)
         if html:
             doc = pq(html)
             trs = doc('#list table tbody tr').items()
             for tr in trs:
                 ip = tr.find('td:nth-child(1)').text()
                 port = tr.find('td:nth-child(2)').text()
                 yield ':'.join([ip, port])
Example #55
 def crawl_kuaidaili(self, page_count=5):
     """
     Proxy source: Kuaidaili
     :param page_count: number of pages to crawl
     :return: proxy
     """
     base_url = 'https://www.kuaidaili.com/free/inha/{}/'
     urls = [base_url.format(page) for page in range(1, page_count + 1)]
     for url in urls:
         time.sleep(1)
         html = etree.HTML(get_page(url))
         proxy_trs = html.xpath("//div[@id='list']/table//tr")
         for i in range(1, len(proxy_trs)):
             ip = proxy_trs[i].xpath("./td[1]")[0].text
             port = proxy_trs[i].xpath("./td[2]")[0].text
             yield ":".join([ip, port])
Example #56
def scrape(url):
    while url is not None:
        # sometimes, requests loads a malformed page
        # we know all pages exist with PNG data, next url, etc
        # keep retrying until we get it
        while True:
            try:
                page = get_page(url)
                id = get_page_identifier(url)
                save_poem(page, id)
                url = next_page_url(page)
                print(url)
                delay()
            except Exception as error:
                print('Failure at volume {}, issue {}, page {}: {}'.format(
                    id['volume'], id['issue'], id['page'], error))
Example #57
 def crawl_daili66(self, page_count=34):
     proxy = []
     start_url = "http://www.66ip.cn/areaindex_{}/1.html"
     urls = [start_url.format(page) for page in range(1, page_count + 1)]
     for url in urls:
         print('Crawling', url)
         html = get_page(url)
         if html:
             soup = BeautifulSoup(html.decode("gbk"), 'lxml')
             trs = soup.find("div",
                             class_="containerbox boxindex").find_all("tr")
             for tr in trs[1:]:
                 IP = tr.find_all("td")[0].get_text()
                 PORT = tr.find_all("td")[1].get_text()
                 TYPE = "http"
                 proxy.append([IP, PORT, TYPE])
     return proxy
Example #58
def index(request):
    '''Home page with pagination.'''
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1

    c = {
        'posts': utils.get_page(Post.objects.all(), page),
        'settings': global_settings,
        'no_sidebar': False,
    }
    c.update(common_response(request))

    return render_to_response('%s/index.html' % THEME,
                              c,
                              context_instance=RequestContext(request))
Example #59
 def crawl_daili66(self, page_count=4):
     """
     Get proxies from daili66
     :param page_count: Page number
     :return: proxy
     """
     start_url = 'http://www.66ip.cn/{}.html'
     urls = [start_url.format(page) for page in range(1, page_count+1)]
     for url in urls:
         html = get_page(url)
         if html:
             doc = pq(html)
             trs = doc('.containerbox table tr:gt(0)').items()
             for tr in trs:
                 ip = tr.find('td:nth-child(1)').text()
                 port = tr.find('td:nth-child(2)').text()
                 yield ':'.join([ip, port])
Example #60
 def crawl_daili66(self, page_count=4):
     '''
     Fetch proxies from 66ip
     '''
     t_url = 'https://www.66ip.cn/{}.html'
     # crawl page_count pages (default 4)
     urls = [t_url.format(page) for page in range(1, page_count + 1)]
     for url in urls:
         print('Crawling', url)
         html = get_page(url)
         if html:
             doc = pq(html)
             trs = doc('.containerbox table tr:gt(0)').items()
             for tr in trs:
                 ip = tr.find('td:nth-child(1)').text()
                 port = tr.find('td:nth-child(2)').text()
                 yield ':'.join([ip, port])