def downloadMyCourses():
    """Download the material of every course listed on the HL2 portal page.

    Fetches the portal page, locates the course table (element id
    ``TotalRecordsTR02``) and, for every course row, creates a directory
    named after the course inside the current working directory, enters it
    and calls ``getCoursePage`` with the page path quoted inside the row's
    ``onclick`` attribute.

    The working directory that was current on entry is restored after each
    course, even when ``getCoursePage`` raises.  Returns ``None``.
    """
    import myHttplib
    import logging
    import os

    FORMAT = '%(asctime)-15s : %(levelname)s : %(module)s : %(lineno)d : %(message)s'
    logging.basicConfig(filename='hl2_logger.log', level=logging.INFO, format=FORMAT)
    logging.info(u""" ------------------------------Downloadin Courses-------------------------------- """)

    url_portal = 'http://hl2.bgu.ac.il/PortalCategories.asp?&test=&Opened02=yesyes'
    content_portal, header_portal, _url_portal_retrieved = myHttplib.connect(
        url_portal,
        COOKIEFILE="D:\\main\\Dropbox\\projects\\python\\downloader\\hl2_cookies.lwp")
    root_portal, _encoding_portal = myHttplib.getRoot(content_portal, header_portal)

    matches = root_portal.xpath("id('TotalRecordsTR02')")
    if not matches:
        # Previously this surfaced as an IndexError on the [0] lookup.
        logging.warning(u"course table 'TotalRecordsTR02' not found on the portal page")
        return

    # Remove the header and last row of the table.
    course_rows = matches[0][0].getchildren()[1:-1]
    # Count the rows that are actually processed (the old message counted the
    # children of the outer element instead of the table rows).
    logging.info(u'found %d courses', len(course_rows))

    start_dir = os.getcwd()
    for courseLine in course_rows:
        course_link = courseLine[0][0]
        courseName = course_link.text
        if not os.path.isdir(courseName):
            os.mkdir(courseName)
        os.chdir(courseName)
        try:
            # The page path is the first single-quoted string in the onclick handler.
            coursePage_clean = course_link.get('onclick').split("'")[1]
            getCoursePage(coursePage_clean)
        finally:
            # Always return to the starting directory so that one failing
            # course cannot make the following ones nest inside it.
            os.chdir(start_dir)
def getNews():
    """Fetch and parse the HL2 portal page.

    Logs a banner, requests ``PortalCategories.asp`` with the stored cookie
    file and parses the response through ``myHttplib.getRoot``.  The parsed
    result is not used any further and nothing is returned.
    """
    import logging
    logging.info(u""" ---------------------------------Getting news----------------------------------- """)
    import myHttplib

    cookie_jar = "D:\\main\\Dropbox\\projects\\python\\downloader\\hl2_cookies.lwp"
    portal_address = 'http://hl2.bgu.ac.il/PortalCategories.asp'

    response = myHttplib.connect(portal_address, COOKIEFILE=cookie_jar)
    page_body, page_headers, _final_address = response
    _tree, _charset = myHttplib.getRoot(page_body, page_headers)
def login():
    """Log in to the HL2 site and return the parsed response of the login POST.

    Performs three requests, all sharing one cookie file:

    1. the site root, from which the ``src`` of the login frame is read;
    2. that login frame, whose input fields are collected with
       ``myHttplib.getInputFields``;
    3. a POST of those fields (with user id and password filled in) to
       ``/sso/login2.asp``.

    Returns:
        ``[root_login3, header_login3]`` - the parsed tree and the headers
        of the POST response.  Whether the login actually succeeded is not
        checked.
    """
    import logging
    FORMAT = '%(asctime)-15s : %(levelname)s : %(module)s : %(lineno)d : %(message)s'
    logging.basicConfig(filename='hl2_logger.log', level=logging.INFO, format=FORMAT)
    logging.info(u""" --------------------------------hl2_bgu.login----------------------------------- """)
    import myHttplib

    url_login_base = "http://hl2.bgu.ac.il/"
    domain_hl = "hl2.bgu.ac.il"
    # One cookie file for every step.  The second request used to pass the
    # relative name "hl2_cookies.lwp", which only resolves to this file when
    # the working directory happens to be the downloader directory.
    cookie_file = "D:\\main\\Dropbox\\projects\\python\\downloader\\hl2_cookies.lwp"

    url_login = url_login_base
    content_login, header_login, url_retrieved_login = myHttplib.connect(
        url_login, COOKIEFILE=cookie_file)
    root_login, _encoding_login = myHttplib.getRoot(content_login, header_login)

    # The login form lives in a frame; its address is relative to the site root.
    url_login2_raw = root_login[1][2][2].get("src")
    url_login2 = url_login_base + url_login2_raw
    content_login2, header_login2, url_retrieved_login2 = myHttplib.connect(
        url_login2, ref_url=url_retrieved_login, domain=domain_hl,
        COOKIEFILE=cookie_file)
    root_login2, _encoding_login2 = myHttplib.getRoot(content_login2, header_login2)

    logging.info(u"build request for login")
    inputs = myHttplib.getInputFields(root_login2)
    # NOTE(review): credentials are hard-coded in the source; they should come
    # from configuration or the environment instead.
    inputs["userID"] = "hagayd"
    inputs["password"] = "******"

    url_login3 = "http://hl2.bgu.ac.il/sso/login2.asp"
    content_login3, header_login3, _url_retrieved_login3 = myHttplib.connect(
        url_login3, post=inputs, ref_url=url_retrieved_login2, domain=domain_hl,
        COOKIEFILE=cookie_file)
    root_login3, _encoding_login3 = myHttplib.getRoot(content_login3, header_login3)

    logging.info(u"Not implemented - did the login Succeeded")
    return [root_login3, header_login3]
def download(root_result):
    """Download every file listed in a parsed search-result page.

    Args:
        root_result: parsed result page; only its ``xpath`` method is used
            here, to look up the grid with id
            ``ctl00_ContentPlaceHolder1_GridView2``.

    Every grid row after the first carries, in the ``onclick`` attribute of
    its first cell's first child, a single-quoted relative path.  That path
    is appended to the download base URL, fetched, and written into the
    current working directory under the path minus its first three
    characters, with ``%2f`` replaced by ``_``.

    Returns ``None``.  When the grid is absent, ``"no files..."`` is logged
    and nothing is downloaded.
    """
    import socket
    import logging

    url_download_base = "http://132.72.23.44/ac_files"
    cookie_file = "maagar_cookies.lwp"

    # xpath() returns a list of matches, so emptiness has to be tested before
    # indexing; the old check ran after [0] and could never be reached.
    grids = root_result.xpath("id('ctl00_ContentPlaceHolder1_GridView2')")
    if not grids:
        logging.info(u"no files...")
        return
    result_elm = grids[0]

    # Imported only once there is something to fetch.
    import myHttplib

    # Row 0 is skipped, exactly as the original index loop started at 1.
    for row in result_elm[1:]:
        relativePath_raw = row[0][0].get('onclick')
        relativePath_clean = relativePath_raw.split("'")[1]
        url_download = url_download_base + relativePath_clean

        # connect() yields three values at every other call site in this
        # file; unpacking only two here raised ValueError.
        try:
            content_result, _header_result, _url_retrieved = myHttplib.connect(
                url_download, COOKIEFILE=cookie_file)
        except socket.error:
            # Kind of retry: one more attempt, errors then propagate.
            content_result, _header_result, _url_retrieved = myHttplib.connect(
                url_download, COOKIEFILE=cookie_file)

        localFileName = relativePath_clean[3:].replace("%2f", "_")
        localFile = open(localFileName, "wb")
        try:
            localFile.write(content_result)
        finally:
            # Close the handle even if the write fails.
            localFile.close()
def main(courseNum):
    """Query the bgu4u "maagar" search page for *courseNum* and download the hits.

    Configures logging to ``logger.log``, builds the search URL (department
    204, first result page, all four material-type check boxes set to
    ``"true"``), fetches and parses it, and passes the parsed page to
    ``download``.

    Args:
        courseNum: value sent as the ``Course`` query parameter.

    Returns:
        ``None``.
    """
    import logging
    log_layout = '%(asctime)-15s : %(levelname)s : %(module)s : %(lineno)d : %(message)s'
    logging.basicConfig(filename='logger.log', level=logging.INFO, format=log_layout)
    logging.info(u""" -----------------------------------new_entry------------------------------------ """)
    import myHttplib
    import urllib

    search_page = "http://iriya.bgu4u.co.il/maagar/searchResults.aspx"
    query_fields = {"q": "",
                    "dep": 204,
                    "Course": courseNum,
                    "PageResult": 1,
                    "chkSikomHartzaot": "true",
                    "chkSikomTigulim": "true",
                    "chkTargilim": "true",
                    "chFormulas": "true"}

    logging.info(u"get result page")
    search_address = "?".join((search_page, urllib.urlencode(query_fields)))
    page_body, page_headers, _final_address = myHttplib.connect(
        search_address, COOKIEFILE="maagar_cookies.lwp")
    result_tree, _charset = myHttplib.getRoot(page_body, page_headers)

    logging.info(u"start downloading")
    download(result_tree)

    logging.info(u"ended gracefully")
def _extract_course_guid(cookie_text):
    """Return the ``vcCourseGUID`` value found in *cookie_text*, or ``None``."""
    key = 'vcCourseGUID'
    start = cookie_text.find(key)
    if start == -1:
        return None
    value_start = start + len(key) + 1  # skip the key and the '=' after it
    # The value ends at the nearest '&' or ';', or at the end of the text
    # when neither follows (the old slice produced a wrong value then).
    ends = [pos for pos in (cookie_text.find('&', value_start),
                            cookie_text.find(';', value_start)) if pos != -1]
    value_end = min(ends) if ends else len(cookie_text)
    return cookie_text[value_start:value_end]


def downloadPage(sid):
    """Download every item listed in the course folder identified by *sid*.

    Steps:

    1. Request ``/bareket/SubjectSkin.asp`` for the folder and read the
       ``vcCourseGUID`` value from the ``set-cookie`` response header.
    2. Request ``/BareketNet/ItemMenu.aspx`` with that GUID and locate the
       element with id ``dgItemsHolder``.
    3. For every row after the first, split the arguments of the row's
       ``onclick`` handler and pass them to ``cmdItemOpen``.

    Args:
        sid: folder (subject) id; converted with ``str`` for the URL.

    Returns:
        ``None``.  Returns early, after logging, when the GUID or the item
        table cannot be found.  Rows whose ``onclick`` is missing or does
        not have the expected nine arguments are skipped with a warning.
    """
    import myHttplib
    import logging
    import random

    logging.info(u'downloading directory with sid: %s', sid)
    cookie_file = "D:\\main\\Dropbox\\projects\\python\\downloader\\hl2_cookies.lwp"
    url_base = "http://hl2.bgu.ac.il"

    url_subjectSkin_param = "?sid=" + str(sid) + "&rand=" + str(random.random())
    url_subjectSkin = url_base + "/bareket/SubjectSkin.asp" + url_subjectSkin_param
    # The same query string, without the leading '?', is also sent as POST data.
    content_subjectSkin, header_subjectSkin, _ = myHttplib.connect(
        url_subjectSkin, post=url_subjectSkin_param[1:], COOKIEFILE=cookie_file)
    myHttplib.getRoot(content_subjectSkin, header_subjectSkin)

    # Integer part of a random number below 22222, as text.  int() replaces
    # the former floor() + str() + find('.') slicing, which cut off the last
    # digit whenever floor() returned an int.
    myrnd = str(int(random.random() * (32222 - 10000)))

    try:
        cookie_subjectSkin = header_subjectSkin.dict["set-cookie"]
    except KeyError:
        # No cookie header at all: handled like a missing GUID below.
        cookie_subjectSkin = ""

    logging.info(u'check for vcCourseGUID if cant find return')
    vcCourseGUID = _extract_course_guid(cookie_subjectSkin)
    if vcCourseGUID is None:
        logging.info(u'cant find vcCourseGUID ')
        return  # can be due to multiple cookie set

    url_ItemMenu = (url_base + "/BareketNet/ItemMenu.aspx"
                    + "?TreePressed=yes&sid=" + str(sid)
                    + "&rsid=0&random=" + myrnd
                    + "&handler=EditHandler&vcCourseGuid=" + vcCourseGUID
                    + "&enableSkin=0&LinkBoardSID=")
    content_ItemMenu, header_ItemMenu, _ = myHttplib.connect(
        url_ItemMenu, COOKIEFILE=cookie_file)
    root_ItemMenu, _ = myHttplib.getRoot(content_ItemMenu, header_ItemMenu)

    logging.info(u'getting item list')
    items = root_ItemMenu.xpath("id('dgItemsHolder')")
    if not items:
        logging.warning(u"item table 'dgItemsHolder' not found for sid: %s", sid)
        return

    # The first row of the table is skipped.
    for item in items[0][0][1:]:
        item_type = item[2][0][0][0][0][0][0].text
        item_date = item[3][0].text
        item_info = item[0][0].get('alt')
        item_link = item[1][0][0][0][0][0]
        item_page = item_link.get('href')
        item_openLink = item_link.get('onclick')
        logging.info(u'item_type:%s, item_date:%s, item_info:%s,item_page:%s ',
                     item_type, item_date, item_info, item_page)

        # Drop the first 19 and the last 2 characters of the handler text,
        # leaving its comma-separated argument list.
        fields = item_openLink[19:-2].split(',') if item_openLink else []
        if len(fields) != 9:
            logging.warning(u"skipping item with unexpected onclick: %s", item_openLink)
            continue
        (subjectID, itemID, sentfrom, courseGUID, courseIDorig,
         courseID, _d1, UserGUID, headertype) = fields
        logging.info(u'subjectID:%s, itemID:%s, sentfrom:%s, courseGUID:%s, courseIDorig:%s, courseID:%s, '
                     u'UserGUID:%s, headertype:%s ',
                     subjectID, itemID, sentfrom, courseGUID, courseIDorig,
                     courseID, UserGUID, headertype)

        # Trim the characters surrounding each argument; the first argument
        # only has a trailing one left after the [19:-2] slice.
        cmdItemOpen(myrnd,
                    subjectID[:-1],
                    itemID[1:-1],
                    sentfrom[1:-1],
                    courseGUID[1:-1],
                    courseIDorig[1:-1],
                    headertype[1:-1])
def cmdItemOpen(myrnd, subjectID, itemID, sentfrom, courseGUID, courseIDorig, headertype):
    """Download a single course item into the current working directory.

    Issues the same sequence of requests for the item, in order:
    ``ShowItemByType.asp``, ``CleanPermission.asp``, ``ShowItemByType2.asp``,
    then the page whose address is embedded in the script text of the
    latter, and finally the file URL quoted inside that page.  The file is
    saved under the unquoted last path segment of its URL; an already
    existing file of that name is left untouched.

    Args:
        myrnd, subjectID, itemID, sentfrom, courseGUID, courseIDorig,
        headertype: strings placed into the query string as ``random``,
            ``sid``, ``iid``, ``sentfrom``, ``vcCourseGuid``, ``vcCourseID``
            and ``headertype`` respectively.

    Returns:
        ``None``.  Returns early (after logging) when the redirect address
        cannot be found in the script text, when the file URL cannot be
        retrieved, or when the local file cannot be opened.
    """
    import logging
    logging.info(u""" ---------------------------------download_item---------------------------------- """)
    logging.info(u"(myrnd:%s, subjectID:%s, itemID:%s, sentfrom:%s, courseGUID:%s, courseIDorig:%s, headertype:%s",
                 myrnd, subjectID, itemID, sentfrom, courseGUID, courseIDorig, headertype)
    import myHttplib
    import os
    import urllib

    url_base = "http://hl2.bgu.ac.il"
    cookie_file = "D:\\main\\Dropbox\\projects\\python\\downloader\\hl2_cookies.lwp"
    # The browser connects first to e.g.
    # http://hl2.bgu.ac.il/Bareket/ShowOriginalItemType.asp?vcCourseID=54767&iid=379529

    # NOTE(review): the trailing 'resizable=no,...' text is appended to the
    # headertype value without a separator.  It is kept as-is because the
    # requests currently work with it - confirm before cleaning up.
    url_param = ('?random=' + myrnd + '&sid=' + subjectID + '&iid=' + itemID
                 + '&sentfrom=' + sentfrom + '&vcCourseGuid=' + courseGUID
                 + '&vcCourseID=' + courseIDorig + '&headertype=' + headertype
                 + 'resizable=no,scrollbars=yes,height=700,width=680,top=58,left=200')

    # The results of the first two requests are not used; the calls are kept
    # in their original order.
    url_ShowItemByType = url_base + "/bareket/ShowItemByType.asp" + url_param
    content_ShowItemByType, header_ShowItemByType, _ = myHttplib.connect(
        url_ShowItemByType, COOKIEFILE=cookie_file)
    myHttplib.getRoot(content_ShowItemByType, header_ShowItemByType)

    logging.info(u"CleanPermission")
    url_CleanPermission = url_base + "/bareket/CleanPermission.asp" + url_param
    myHttplib.connect(url_CleanPermission, COOKIEFILE=cookie_file)

    url_ShowItemByType2 = url_base + "/bareket/ShowItemByType2.asp" + url_param
    content_ShowItemByType2, header_ShowItemByType2, _ = myHttplib.connect(
        url_ShowItemByType2, COOKIEFILE=cookie_file)
    root_ShowItemByType2, _ = myHttplib.getRoot(content_ShowItemByType2, header_ShowItemByType2)

    # The next address is the double-quoted string that follows the frame
    # assignment in the page's script text.
    script_text = root_ShowItemByType2[1].text
    marker = "frames['ItemBody'].window.location.href="
    url_Pos = script_text.find(marker) if script_text else -1
    if url_Pos == -1:
        # Without this guard find() would go on from -1 and build a garbage URL.
        logging.warning(u"can't find item address in %s", url_ShowItemByType2)
        return
    url_startPos = script_text.find('"', url_Pos)
    url_endPos = script_text.find('"', url_startPos + 1)
    url_item_path = "http://hl2.bgu.ac.il/bareket/" + script_text[url_startPos + 1:url_endPos]

    content_item_path, header_item_path, _ = myHttplib.connect(
        url_item_path, COOKIEFILE=cookie_file)
    root_item_path, _ = myHttplib.getRoot(content_item_path, header_item_path)

    # The file URL is the first single-quoted string in this node's text.
    url_item_raw = root_item_path[1][0][3].text
    url_item = url_item_raw.split("'")[1]
    logging.info(u"downloading: |%s|", url_item)
    content_item, _header_item, url_item_retrieved = myHttplib.connect(
        url_item, COOKIEFILE=cookie_file)
    if url_item_retrieved is None:
        logging.warning(u"can't get url %s", url_item)
        return

    fileName = urllib.unquote(url_item[url_item.rfind("/") + 1:])
    if os.path.isfile(fileName):
        # Already downloaded earlier; keep the existing copy.
        return
    try:
        logging.info(u"trying to save to: %s", fileName)
        file_handler = open(fileName, 'wb')
    except IOError as e:
        logging.error(u"faild open file reason: %s", e)
        print(e)
        return
    try:
        file_handler.write(content_item)
    finally:
        # Close the handle even if the write fails.
        file_handler.close()
def getCoursePage(url_coursePage):
    """Walk from a course page down to its root folder tree and hand it on.

    Args:
        url_coursePage: site-relative path of the course page; it is
            prefixed with ``http://hl2.bgu.ac.il`` before being fetched.

    Five pages are fetched in sequence, each address being read from the
    previous page, and the ``ulRoot`` element of the last one is passed to
    ``directoryHandler``.

    Reference notes (example addresses seen while exploring the site):

    course page:
        http://hl2.bgu.ac.il/Eclass/CourseFrames.asp?vcCourseGUID=&vcCourseID=54767&language=972
        /eClass/Announcements.asp?vcCourseID=54767&vcCourseName=%E4%F7%F9%F8%20%E4%EB%E9%EE%E9
    root_knowledge:
        http://hl2.bgu.ac.il/bareket/fromArik.asp?language=100&goto=ExerciseFrames.asp?user=student&vcCourseID=54767&vcCourseName=%E4%F7%F9%F8%20%E4%EB%E9%EE%E9
    rootFolder:
        http://hl2.bgu.ac.il/bareketnet/toc.aspx?width=200&handler=EditHandler&DIR=RTL&RootDir=0xAF5700BEDA912846AB4F7B537A61E744&SID=&KBItemID=&KBItemDesc=&KBItemMetaData=&LinkPosition=&TestID=&LinkBUID=&ComeFrom=&EnableSubjectSkin=0
    sub folder / folder list:
        http://hl2.bgu.ac.il/BareketNet/ItemMenu.aspx?TreePressed=yes&sid=152493&rsid=0&random=2531&handler=EditHandler&vcCourseGuid=0xAF5700BEDA912846AB4F7B537A61E744&enableSkin=0&LinkBoardSID=#IAHREF4635769499
    itemPage:
        http://hl2.bgu.ac.il/bareket/ShowItemByType.asp?random=4585&sid=152493&iid=463576&sentfrom=ExerciseMenu.asp&vcCourseGuid=AF5700BEDA912846AB4F7B537A61E744&vcCourseID=54767&headertype=2
    script seen in a page:
        parent.frames["vcCourseMain"].document.location.href="/bareket/fromArik.asp?language=100&vcCourseID=54767&goto=../eClass/administration.asp?Send=1&vcCourseGUID=AF5700BEDA912846AB4F7B537A61E744&vcParentGuid=&vcCourseName=%E4%F7%F9%F8+%E4%EB%E9%EE%E9

    Folder tree nodes seen (index path: content; titles translated from Hebrew):
        [1][1][0][1][10][2]  homework submission
        [1][1][0][1][8][2]   a: homework solutions
        [1][1][0][1][6][2]   homework exercises
        [1][1][0][1][0][2]   syllabus
        [1][1][0][1][0][1]   img: {'src': 'foldericon.gif', 'imgleid': '54767', 'imgdef': 'foldericon.gif', 'imgid': '2304698', 'onclick': "setMainStatus(event);setActiveNode('2304698','0','BE0057AF-91DA-4628-AB4F-7B537A61E744','54767','foldericon.gif',event);", 'imglnk': '0', 'imgleguid': 'BE0057AF-91DA-4628-AB4F-7B537A61E744'}
        [1][1][0][1][0][2]   a: {'id': '2304698'
        [1][1][0][1][0][3]   hidden: {'value': '2304698'}
        [1][1][0][1][0][7]   hidden: {'value': 'BE0057AF-91DA-4628-AB4F-7B537A61E744'}
        [1][1][0][1][0][8]   input: {'type': 'hidden', 'name': 'hidden', 'value': '2304698'}
        [1][1][0][1][0][12]  input: {'type': 'hidden', 'name': 'hidden', 'value': 'BE0057AF-91DA-4628-AB4F-7B537A61E744'}
        [1][1][0][1][1]      ul: {'class': 'hdn', 'id': 'ul2304698'} None None
        [1][1][0][1][2]      li: {'class': 'kid', 'id': 'li152493'}
        [1][1][0][1][2][0]   img: {'src': '/upload/misc/CssFiles/Images/plus.gif'}
        [1][1][0][1][2][1]   img: {'src': 'foldericon.gif', 'imgleid': '54767', 'imgdef': 'foldericon.gif', 'imgid': '152493', 'onclick': "setMainStatus(event);setActiveNode('152493','0','BE0057AF-91DA-4628-AB4F-7B537A61E744','54767','foldericon.gif',event);", 'imglnk': '0', 'imgleguid': 'BE0057AF-91DA-4628-AB4F-7B537A61E744'}
        [1][1][0][1][2][2]   a: {'id': '152493', 'href': 'javascript:void(0)', 'target': 'Main', 'onclick': 'javascript:NoOp()', 'title': 'lectures'} lectures
        [1][1][0][1][2][3]   hidden: {'value': '152493'}
        [1][1][0][1][2][7]   hidden: {'value': 'BE0057AF-91DA-4628-AB4F-7B537A61E744'}
        [1][1][0][1][2][8]   input: {'type': 'hidden', 'name': 'hidden', 'value': '152493'}

    rand:0.882085807621479
    """
    import logging
    logging.info(u""" ---------------------------------getCoursePage---------------------------------- """)
    import myHttplib
    url_base = "http://hl2.bgu.ac.il"
    # Step 1: the course page itself (the parameter is rebound to the absolute URL).
    url_coursePage = url_base + url_coursePage
    [content_coursePage, header_coursePage , url_coursePage_retrieved] = myHttplib.connect(url_coursePage, COOKIEFILE = "D:\\main\\Dropbox\\projects\\python\\downloader\\hl2_cookies.lwp")
    [root_coursePage, encoding_coursePage] = myHttplib.getRoot(content_coursePage, header_coursePage)
    # Step 2: the page named by the 'src' attribute of node [1][0], under /Eclass/.
    url_top_path = root_coursePage[1][0].get('src')
    url_top = url_base + "/Eclass/" + url_top_path
    [content_top, header_top , url_top_retrieved] = myHttplib.connect(url_top, COOKIEFILE = "D:\\main\\Dropbox\\projects\\python\\downloader\\hl2_cookies.lwp")
    [root_top, encoding_top] = myHttplib.getRoot(content_top, header_top)
    # Step 3: the link ('href') found at a fixed position in that page.
    url_knowledge_path = root_top[2][2][0][2][0].get('href')
    url_knowledge = url_base + url_knowledge_path
    [content_knowledge, header_knowledge , url_knowledge_retrieved] = myHttplib.connect(url_knowledge, COOKIEFILE = "D:\\main\\Dropbox\\projects\\python\\downloader\\hl2_cookies.lwp")
    [root_knowledge, encoding_knowledge] = myHttplib.getRoot(content_knowledge, header_knowledge)
    # Step 4: the next address is taken from the node's text, as the 20th
    # piece when splitting on double quotes - tied to the exact page layout.
    scriptText_knowledge = root_knowledge[1][0][0].text
    url_knowledge2_path = scriptText_knowledge.split('"')[19]
    url_knowledge2 = url_base + '/bareket/' + url_knowledge2_path
    [content_knowledge2, header_knowledge2 , url_knowledge2_retrieved] = myHttplib.connect(url_knowledge2, COOKIEFILE = "D:\\main\\Dropbox\\projects\\python\\downloader\\hl2_cookies.lwp")
    [root_knowledge2, encoding_knowledge2] = myHttplib.getRoot(content_knowledge2, header_knowledge2)
    # Step 5: the root folder page, named by a 'src' attribute.
    url_rootFolder_path = root_knowledge2[2][1][1].get('src')
    url_rootFolder = url_base + url_rootFolder_path
    [content_rootFolder, header_rootFolder , url_rootFolder_retrieved] = myHttplib.connect(url_rootFolder, COOKIEFILE = "D:\\main\\Dropbox\\projects\\python\\downloader\\hl2_cookies.lwp")
    [root_rootFolder, encoding_rootFolder] = myHttplib.getRoot(content_rootFolder, header_rootFolder)
    # The element with id 'ulRoot' is handed to directoryHandler (defined
    # outside this view); raises IndexError when the element is absent.
    ul_root = root_rootFolder.xpath("id('ulRoot')")[0]
    logging.info(u"getting course main dir")
    directoryHandler(ul_root)
    # NOTE(review): nothing in view follows this message - the address book
    # step appears to be unimplemented or to live beyond this excerpt.
    logging.info(u"getting course addresbook")