Example #1
    def run(self):
        self._login()

        url_course = self.url_prefix + "xskbcx.aspx?xh=" + self.username
        r_course = requests.get(url_course, cookies=self.cookies)
        # Parse the course table. The site redirects to a questionnaire ("调查问卷")
        # until it has been filled in, so detect that marker and bail out.
        if u"调查问卷".encode(self.charset) in r_course.content:
            raise GrabError("无法抓取您的课程,请先填写教务网调查问卷。")  # "Cannot grab your courses; fill in the survey on the academic site first."
        strainer = SoupStrainer("table", id="xsgrid")
        soup = BeautifulSoup(r_course.content, parse_only=strainer)
        rows = soup.select("tr")
        courses = []
        for r in rows:
            if r.has_attr('class') and r['class'] == ["datagridhead"]:
                continue  # skip the header row

            cols = r.select("td")
            semester_text = cols[3].get_text(strip=True)
            time_texts = [text for text in cols[4].stripped_strings]
            locations = [text for text in cols[5].stripped_strings]

            lessons = self.get_lessons(time_texts, locations, semester_text)

            course = {
                'original_id': cols[0].get_text(strip=True),
                'name': cols[1].get_text(strip=True),
                'teacher': cols[2].get_text(strip=True),
                'lessons': lessons,
            }
            courses.append(course)
        self.courses = courses
        return pretty_format(courses)
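
The pattern above — a SoupStrainer that limits parsing to the one table of interest, then a class check to skip the grid's header row — can be exercised on its own. A minimal, runnable sketch against made-up HTML (the real markup comes from xskbcx.aspx):

from bs4 import BeautifulSoup, SoupStrainer

html = '''
<table id="xsgrid">
  <tr class="datagridhead"><td>ID</td><td>Name</td></tr>
  <tr><td>C001</td><td>Calculus</td></tr>
</table>
'''

strainer = SoupStrainer("table", id="xsgrid")
soup = BeautifulSoup(html, "html.parser", parse_only=strainer)
for row in soup.select("tr"):
    if row.has_attr('class') and row['class'] == ["datagridhead"]:
        continue  # header row, as in the grabber above
    print([td.get_text(strip=True) for td in row.select("td")])
# -> [u'C001', u'Calculus'] (unicode prefixes under Python 2)
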
Example #2
    def run(self):
        self._login()

        url_course = self.url_prefix + 'xkAction.do?actionType=6'
        r_course = requests.get(url_course, cookies=self.cookies)

        # Rewrite class -> id so data rows can be filtered via r['id'] below.
        soup = BeautifulSoup(r_course.content.replace('class', 'id'))
        soup = soup.find_all("table")[7]  # the eighth table holds the course grid

        rows = soup.select("tr")

        courses = []
        for r in rows:
            if r.has_attr('id') and r['id'] != "odd":
                continue  # data rows carry class "odd" (rewritten to id above)

            cols = r.select("td")

            if not cols:
                continue

            location = ' '.join(cols[k].get_text(strip=True) for k in (15, 16, 17))
            teacher = self.get_teachers(cols[7].get_text(strip=True))
            weeks_text = cols[11].get_text(strip=True)
            day_text = cols[12].get_text(strip=True)
            # Periods come as a start period plus a count; fold into "start-end".
            start = int(cols[13].get_text(strip=True))
            count = int(cols[14].get_text(strip=True))
            start_end_text = '{0}-{1}'.format(start, start + count - 1)

            lessons = self.get_lessons(weeks_text, day_text, start_end_text, location)

            course = {
                'original_id': cols[1].get_text(strip=True),
                'name': cols[2].get_text(strip=True),
                'teacher': teacher,
                'lessons': lessons,
            }
            courses.append(course)

        self.courses = courses
        return pretty_format(courses)
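
The start_end_text above folds a starting period and a period count into a "start-end" range. The same arithmetic in isolation, with a hypothetical helper name and invented inputs:

def to_start_end(start_text, count_text):
    # e.g. period "3" lasting "2" periods -> "3-4"
    start = int(start_text)
    return '{0}-{1}'.format(start, start + int(count_text) - 1)

print(to_start_end("3", "2"))  # -> 3-4
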
Example #3
            course = {
                'original_id': row[1],
                'name': row[2],
                'credit': str(float(row[4])),
                'teacher': teacher,
                'lessons': [lesson],
            }

            if courses:
                if (course['original_id'] == courses[-1]['original_id']
                        and course['teacher'] == courses[-1]['teacher']
                        and prev_code_name == code_name):
                    courses[-1]['lessons'].append(lesson)
                else:
                    courses.append(course)
            else:
                courses.append(course)

            prev_code_name = code_name

        total_courses = len(courses)
        print "Converted %d courses. Writing to yaml...\n" % total_courses
        if courses:
            with open('bjtu.yaml', 'w') as yaml_file:
                yaml_file.write(pretty_format(courses))

except IOError:
    print "Cannot open data/bjtu.csv, exiting."
    exit()
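
The merge above appends a lesson to the previous course whenever original_id, teacher and the course code all repeat from one CSV row to the next. A self-contained sketch of that grouping over invented rows (field names mirror the snippet; the data does not come from data/bjtu.csv):

def merge_rows(rows):
    # rows: (original_id, teacher, code_name, lesson) tuples in file order
    courses = []
    prev_code_name = None
    for original_id, teacher, code_name, lesson in rows:
        if (courses
                and courses[-1]['original_id'] == original_id
                and courses[-1]['teacher'] == teacher
                and prev_code_name == code_name):
            courses[-1]['lessons'].append(lesson)
        else:
            courses.append({'original_id': original_id,
                            'teacher': teacher,
                            'lessons': [lesson]})
        prev_code_name = code_name
    return courses

merged = merge_rows([('C1', 'Zhang', 'A', 'Mon 1-2'),
                     ('C1', 'Zhang', 'A', 'Wed 3-4'),
                     ('C2', 'Li', 'B', 'Fri 5-6')])
print(len(merged))  # -> 2; the first course holds both lessons
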
Example #4
    def grab_all(self):
        self._local_setup()
        self.next_url = 'http://portal.ruc.edu.cn/cas/login?service=http%3A%2F%2Fportal.ruc.edu.cn%2Fidc%2Feducation%2Fselectcourses%2Fresultquery%2FResultQueryAction.do%3Fmethod%3DforwardAllQueryXkjg'
        self._login()

        r_cookies = requests.post(self.next_url, cookies=self.cookies, verify=False)
        content = r_cookies.content.decode(self.charset)
        self.cookies = r_cookies.cookies

        # Parse, starting with the list of colleges.
        strainer_colleges = SoupStrainer("select", id="condition_yx")
        soup_colleges = BeautifulSoup(r_cookies.content.decode('gbk'), parse_only=strainer_colleges)
        colleges = [option['value'] for option in soup_colleges.select("option") if option['value']]
        colleges_name = [option.get_text() for option in soup_colleges.select("option") if option['value']]
        pretty_print(colleges_name)
        print "{0} colleges.".format(len(colleges))

        # Iterate over the colleges.
        total_courses = 0
        for i, college in enumerate(colleges):
            courses = []
            url_courses = 'http://portal.ruc.edu.cn/idc/education/selectcourses/resultquery/ResultQueryAction.do'
            # Query the courses one period index (ksj) at a time.
            for j in range(1, 15):
                data = {
                    'method': "allJxb",
                    'condition_xnd': "2012-2013",
                    'condition_xq': "1",
                    'condition_yx': college.encode('gbk'),
                    'isNeedInitSQL': "true",
                    'ksj1': j,
                    'ksj2': j,
                }
                r_courses = requests.post(url_courses, data=data, cookies=self.cookies)
                content = r_courses.content.decode('gbk')

                soup_courses = BeautifulSoup(content)
                rows = soup_courses.find_all("row")

                if len(rows) == 1:
                    continue

                for r in rows:
                    teacher = r.select("xm")[0].get_text(strip=True).replace('/', ',')
                    time_and_location_texts = r.select("sksj > tagbr")

                    lessons = self.get_lessons(time_and_location_texts)

                    course = {
                        'original_id': r.select("jxbh")[0].get_text(strip=True),
                        'name': r.select("kcmc")[0].get_text(strip=True),
                        'credit': str(float(r.select("xf")[0].get_text(strip=True))),
                        'teacher': teacher,
                        'lessons': lessons,
                    }
                    courses.append(course)

            print "#{0} {1}: {2} courses.".format(i, colleges_name[i].encode('utf8'), len(courses))
            if not courses:
                continue
            total_courses += len(courses)
            output_dir = os.path.join(os.path.dirname(__file__), 'ruc')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            with open(os.path.join(output_dir, colleges_name[i] + '.yaml'), 'w') as yaml_file:
                yaml_file.write(pretty_format(courses))
        print "Done! Totally exported {0} courses.".format(total_courses)
Example #5
    def grab_all(self):
        # self._local_setup()
        # self._login()
        self._fake_login()

        url_courses = self.url_prefix + "jxrw_zd.aspx?xh=" + self.username
        # Grab the ASP.NET __VIEWSTATE hidden field; the POST below must echo it back.
        r_viewstate = requests.get(url_courses, cookies=self.cookies)
        result = re.search(
            '<input type="hidden" name="__VIEWSTATE" value="(.+)" />',
            r_viewstate.content)
        viewstate = result.group(1)

        print "Get viewstate: done."
        # Parse, starting with the list of colleges.
        strainer_colleges = SoupStrainer("select", id="ddlXY")
        soup_colleges = BeautifulSoup(r_viewstate.content.decode(self.charset),
                                      parse_only=strainer_colleges)
        colleges = [
            option['value'] for option in soup_colleges.select("option")
            if option['value']
        ]
        pretty_print(colleges)
        print "{} colleges.".format(len(colleges))
        # Iterate over the colleges.
        total_courses = 0
        for i, college in enumerate(colleges):
            # POST the filter form for this college.
            data = {
                '__EVENTTARGET': "",
                '__EVENTARGUMENT': "",
                '__VIEWSTATE': viewstate,
                'ddlXN': "2012-2013",
                'ddlXQ': "1",
                'ddlXY': college.encode(self.charset),
                'ddlZY': "",
                'ddlKC': "",
                'btnFilter': u' 查 询 '.encode(self.charset),
            }
            r_courses = requests.post(url_courses,
                                      data=data,
                                      cookies=self.cookies)
            content = r_courses.content.decode(self.charset)

            strainer_courses = SoupStrainer("table", id="DBGrid")
            soup_courses = BeautifulSoup(content, parse_only=strainer_courses)
            rows = soup_courses.select("tr")

            courses = []
            for r in rows:
                if r.has_attr('class') and r['class'] == ["datagridhead"]:
                    continue  # skip the header row

                cols = r.select("td")
                semester_text = cols[0].get_text(strip=True)
                teacher = cols[7].get_text(strip=True).replace('/', ',')
                time_texts = [s.strip() for s in cols[8].get_text().split(';')]
                locations = [s.strip() for s in cols[9].get_text().split(';')]

                lessons = self.get_lessons(time_texts, locations,
                                           semester_text)

                course = {
                    'original_id': cols[3].get_text(strip=True),
                    'name': cols[4].get_text(strip=True),
                    'credit': float(cols[6].get_text(strip=True)),
                    'teacher': teacher,
                    'lessons': lessons,
                }
                courses.append(course)

            print "#{} {}: {} courses.".format(i, college.encode("utf8"),
                                               len(courses))
            total_courses += len(courses)
            output_dir = os.path.join(os.path.dirname(__file__), 'zju')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            with open(os.path.join(output_dir,
                                   str(i) + '.yaml'), 'w') as yaml_file:
                yaml_file.write(pretty_format(courses))
            # with open(os.path.join(output_dir, str(i) + '.html'), 'w') as html_file:
            #     html_file.write(soup_courses.prettify().encode("utf8"))
        print "Done! Totally exported {} courses.".format(total_courses)
Example #6
    def grab_all(self):
        self._local_setup()
        self._login()

        url_courses = self.url_prefix + 'courseSearchAction.do?temp=1'

        # Grab the Struts synchronizer token; the search POST must include it.
        r_viewstate = requests.get(url_courses, cookies=self.cookies)
        result = re.search(
            '<input type="hidden" name="org.apache.struts.taglib.html.TOKEN" value="(.+)">',
            r_viewstate.content)
        TOKEN = result.group(1)

        print "Get TOKEN: done."

        # Parse, starting with the list of colleges.
        strainer_colleges = SoupStrainer('select', id="xsjc")
        # The <select> carries only a name attribute; rewrite name -> id so the
        # id-based strainer above can match it.
        soup_colleges = BeautifulSoup(r_viewstate.content.decode('gbk').replace('name', 'id'),
                                      parse_only=strainer_colleges)
        colleges = [option['value'] for option in soup_colleges.select("option")
                    if option['value']]
        pretty_print(colleges)
        print "{0} colleges.".format(len(colleges))

        # Iterate over the colleges.
        url_courses = self.url_prefix + 'courseSearchAction.do'
        total_courses = 0
        for i, college in enumerate(colleges):
            # Fetch the courses, requesting these gbk-encoded "field#label" columns.
            showColumn = [s.encode('gbk') for s in [
                u'kch#课程号', u'kcm#课程名', u'xf#学分', u'skjs#教师',
                u'zcsm#周次', u'skxq#星期', u'skjc#节次', u'xqm#校区',
                u'jxlm#教学楼', u'jasm#教室', u'kxh#课序号',
            ]]
            data = {
                'org.apache.struts.taglib.html.TOKEN': TOKEN.encode('gbk'),
                'pageNumber': "0".encode('gbk'),
                'actionType': "1".encode('gbk'),
                'xsjc': college.encode('gbk'),
                'pageSize': '1000'.encode('gbk'),
                'showColumn': showColumn,
            }
            r_courses = requests.post(url_courses, data=data, cookies=self.cookies)
            content = r_courses.content.decode('gbk')

            strainer_courses = SoupStrainer("table", id="titleTop2")
            soup_courses = BeautifulSoup(content.replace('class', 'id'), parse_only=strainer_courses)
            rows = soup_courses.select("tr")
            prev_code_name = '-1'

            courses = []
            for r in rows:
                if not r.has_attr('id'):
                    continue  # keep only rows that had a class (rewritten to id above)

                cols = r.select("td")
                try:
                    test_text = cols[0].get_text(strip=True)
                except:
                    break
                teacher = self.get_teachers(cols[3].get_text(strip=True))
                weeks_text = cols[4].get_text(strip=True)
                day_text = cols[5].get_text(strip=True)
                start_end_text = cols[6].get_text(strip=True)
                location = ' '.join(cols[k].get_text(strip=True) for k in (7, 8, 9))
                lessons = self.get_lessons(weeks_text, day_text, start_end_text, location)
                code_name = cols[10].get_text(strip=True)

                course = {
                    'original_id': cols[0].get_text(strip=True),
                    'name': cols[1].get_text(strip=True),
                    'credit': str(float(cols[2].get_text(strip=True).replace(u'\xa0', ''))),  # bs4 renders &nbsp; as u'\xa0'
                    'teacher': teacher,
                    'lessons': lessons,
                }

                if courses:
                    last_course = courses.pop()
                    if (course['original_id'] == last_course['original_id']
                            and course['teacher'] == last_course['teacher']
                            and prev_code_name == code_name):
                        # Same section continued across rows: merge the lessons.
                        course['lessons'] = course['lessons'] + last_course['lessons']
                    else:
                        courses.append(last_course)

                prev_code_name = code_name
                courses.append(course)

            print "#{0} {1}: {2} courses.".format(i, college.encode("utf8"), len(courses))
            total_courses += len(courses)
            output_dir = os.path.join(os.path.dirname(__file__), 'bupt')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            if courses:
                with open(os.path.join(output_dir, str(i) + '.yaml'), 'w') as yaml_file:
                    yaml_file.write(pretty_format(courses))
        print "Done! Totally exported {0} courses.".format(total_courses)