Пример #1
0
 def get_config(self):
     """
     Build the config object for ``self.model_type`` and install its
     attention class.

     ``MODELS[self.model_type]`` yields a ``(config factory, attention)``
     pair. The factory is called with ``self.config`` as keyword arguments,
     then the attention object replaces ``RobertaSelfAttention`` in
     ``transformers.models.roberta.modeling_roberta`` (a process-wide
     change). The built config is returned.
     """
     config_factory, attention_impl = MODELS[self.model_type]
     built_config = config_factory(**self.config)
     # Patch only after the config was built successfully.
     setattr(
         transformers.models.roberta.modeling_roberta,
         "RobertaSelfAttention",
         attention_impl,
     )
     return built_config
Пример #2
0
    def __init__(self, model_path='tmp/model', w2id_path='data/w2id.json'):
        """
        Load the JSON mapping, build the LSTM model and restore its weights.

        :param model_path: checkpoint path handed to ``Saver.restore``.
        :param w2id_path: JSON file loaded into ``self.w2id``
            (presumably a word-to-id table -- confirm against the caller).
        """
        with open(w2id_path) as mapping_file:
            self.w2id = json.load(mapping_file)

        settings = config()
        self.num_steps = settings.num_steps
        self.cutModel = lstm_model(settings)

        # The session is created first, then the saver restores into it.
        self.sess = tf.Session()
        tf.train.Saver().restore(self.sess, model_path)
Пример #3
0
# Initialisation script: creates the Redis client, resets the execution
# flags, loads the configuration, sets up logging and loads the squawk data.
# A status text for an SMS is accumulated in dataTextSms along the way.

# Instantiate the Redis client object
objRedis = initRedis()

# Will hold the text of the SMS
dataTextSms = ""

# Defines whether execution should continue for all of the scripts
objRedis.set('flagExecute_Treatment', 0)
objRedis.set('nameExecute_Treatment', 'INIT')
# English rendering of the note below: instantiate the object that manages
# the configuration and fetch its data so it can be placed in Redis
# (keys: config_*).
# NOTE(review): presumably getThis() is what writes the config_* keys --
# its body is not visible here.
"""
On instancie l'objet gérant les configurations
Et on récupère les informations pour les placer dans Redis
Clé : config_*
"""
objConfig = config()
objConfig.getThis()
dataTextSms = dataTextSms + "Config OK \r\n"

# Define the log file path: the 'config_path_log' value read from Redis
# (bytes decoded as UTF-8) followed by today's date (YYYYMMDD) and the
# '_init_adsb.log' suffix.
logging.basicConfig(filename=objRedis.get('config_path_log').decode("utf-8") +
                    time.strftime('%Y%m%d') + '_init_adsb.log',
                    level=logging.INFO)
# English rendering of the note below: instantiate the object that manages
# the squawk codes and fetch its data so it can be placed in Redis
# (key: squawk).
"""
On instancie l'objet gérant les squawk
Et on récupère les informations pour les placer dans Redis
Clé : squawk
"""
objSquawk = squawk()
returnSquawk = objSquawk.setDataInRedis()
# Append the value returned by setDataInRedis() to the SMS text.
dataTextSms = dataTextSms + " " + returnSquawk.__str__() + " Squawk \r\n"
Пример #4
0
            doc_file_path = model.generate_path([main_path, "docs", doc_name])
            print("文档: \"{name}\" \"{link}\"".format(name=doc_name,
                                                     link=doc_link))
            doc_list.append((doc_link, doc_file_path))

        # 下载
        if config.Download:
            if config.Download_Method == "Aria2":  # 这里是调用aria2的下载
                model.aira2_download(info_list + video_list + doc_list)
                model.download_queue(session,
                                     srt_list,
                                     queue_length=config.Download_Queue_Length
                                     )  # 需要session或者有时间期限的
            else:  # 默认调用自建下载
                model.download_queue(session,
                                     info_list + video_list + srt_list +
                                     doc_list,
                                     queue_length=config.Download_Queue_Length)

    else:
        print("No course Id,Please check!")

    return


# Script entry point: runs main() for the "xuetangx" site.
if __name__ == '__main__':
    # Course URL to download; empty here, to be filled in before running.
    course_url = ""
    # Load the "xuetangx" settings from settings.conf
    config = model.config("settings.conf", "xuetangx")
    main(course_url, config)
Пример #5
0
                                print("字幕Eng: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_eng_link))
                                srt_list.append((srt_eng_link, srt_file_path))
                        video_in_chapter_list[-1] += 1

                    if lesson_content_type == 3 and config.Download_Docs:  # Documentation
                        doc_link = str(re.search(r'textOrigUrl:"(.+?)"', rdata).group(1))
                        doc_name = "{0}.pdf".format(lesson_name)
                        doc_path = model.generate_path([main_path, "Docs", lesson_loc_pattern])
                        doc_file_path = model.generate_path([doc_path, doc_name])
                        doc_list.append((doc_link, doc_file_path))
                        print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))

            if config.Download:
                if config.Download_Method == "Aria2":  # 这里是调用aria2的下载
                    model.aira2_download(info_list + video_list)
                    # 需要session或者有时间期限的,仍然使用自建下载
                    model.download_queue(session, srt_list + doc_list, queue_length=config.Download_Queue_Length)
                else:  # 默认调用自建下载
                    model.download_queue(session, info_list + video_list + srt_list + doc_list,
                                         queue_length=config.Download_Queue_Length)
        else:
            err_message = re.search(r'message:(.+)\}\)', rdata).group(1)
            print("Error:{0},Please make sure you login by 163-email "
                  "and your \"Session-Cookies\" pair is right.".format(err_message))


# Script entry point: runs main() for the "icourse163" site.
if __name__ == '__main__':
    # Course URL to download; empty here, to be filled in before running.
    course_url = ""
    # Load the "icourse163" settings from settings.conf
    config = model.config("settings.conf", "icourse163")
    main(course_url, config=config)
Пример #6
0
def main(course_url):
    """Gather the download list for one icourse163.org course and fetch it.

    Steps, in order:

    1. Load the "icourse163" settings and log in through ``model.login``.
    2. Extract the course id (and optional ``tid``) from ``course_url``.
    3. Scrape the course info page for the title, school, teachers, the cover
       image and, when present, the intro video.
    4. Ask the DWR ``CourseBean`` endpoints for the chapter listing and, per
       lesson, for the video / subtitle / document links.
    5. When ``config.Download`` is truthy, hand the collected
       ``(link, file_path)`` pairs to ``model.aira2_download`` and/or
       ``model.download_queue``.

    :param course_url: course URL containing ``learn/<id>`` or
        ``course/<id>``, optionally followed by ``?tid=<termId>``.
    :return: None. Progress and errors are reported with ``print``.
    """
    config = model.config("settings.conf", "icourse163")
    session = model.login(site="icourse163", conf=config)
    http_session_id = session.cookies["NTESSTUDYSI"]
    c_tid = re.search(r"(?:(learn)|(course))/(?P<id>(?P<c_id>[\w:+-]+)(\?tid=(?P<t_id>\d+))?)#?/?", course_url)

    # Download cache list
    main_list = []
    srt_list = []
    doc_list = []

    # handle the course_url links to Get right courseId and termId
    if c_tid:
        if c_tid.group("t_id"):  # the user supplied a tid: use it
            term_id = c_tid.group("t_id")
            info_url = "http://www.icourse163.org/course/{id}#/info".format(id=c_tid.group('id'))
        else:  # otherwise the latest tid is read from the info page below
            term_id = None
            print("No termId which you want to download.Will Choose the Lastest term.")
            # use the course's default address
            info_url = "http://www.icourse163.org/course/{id}#/info".format(id=c_tid.group('c_id'))
        page_about = session.get(url=info_url)
        if page_about.url == page_about.request.url:  # the course exists
            # A missing course is 302-redirected to http://www.icourse163.org/,
            # so existence is checked by comparing response and request URLs.
            page_about_bs = BeautifulSoup(page_about.text, "lxml")
            course_info_raw = page_about_bs.find("script", text=re.compile(r"termDto")).string.replace("\n", "")
            if term_id is None:  # no tid supplied: take the one found on the page
                term_id = re.search(r"termId : \"(\d+)\"", course_info_raw).group(1)
            # Get course info from the page title
            course_page_title = re.search(r'(.+?)_(.+?)_(.+?)', page_about_bs.title.string)
            course_title = model.clean_filename(course_page_title.group(1))
            school = course_page_title.group(2)
            teacher = model.sort_teacher(page_about_bs.find_all('h3', class_="f-fc3"))
            folder = model.clean_filename('-'.join([course_title, school, teacher]))

            print("The Download INFO:\n"  # Output download course info
                  "link:{url}\nCourse:{folder}\nid:{id}\n".format(url=info_url, folder=folder, id=term_id))

            main_path = model.generate_path([config.Download_Path, folder])

            # Cover image
            info_img_link = page_about_bs.find("div", id="j-courseImg").img["src"]
            img_file_name = r"课程封面图-{title}.png".format(title=course_title)
            img_file_path = model.generate_path([main_path, img_file_name])
            print("课程封面图: {link}".format(link=info_img_link))
            main_list.append((info_img_link, img_file_path))

            # intro_video
            video_search = re.search(r"videoId : \"(\d+)\"", course_info_raw)
            if video_search:
                payload = {
                    'callCount': 1,
                    'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
                    'httpSessionId': http_session_id,
                    'c0-scriptName': 'CourseBean',
                    'c0-methodName': 'getLessonUnitPreviewVo',
                    'c0-id': 0,
                    'c0-param0': video_search.group(1),
                    'c0-param1': 1,
                    'batchId': random.randint(1000000000000, 20000000000000)
                }
                ask_video_url = "http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitPreviewVo.dwr"
                resp = session.post(url=ask_video_url, data=payload).text
                # Keys are tried in this order; the first one present wins.
                for k in ['mp4ShdUrl', 'mp4HdUrl', 'mp4SdUrl']:  # , 'flvShdUrl', 'flvHdUrl', 'flvSdUrl'
                    video_search_group = re.search(r's\d+.(?P<VideoType>' + str(k) + ')="(?P<dllink>.+?)";', resp)
                    if video_search_group:
                        info_video_link = video_search_group.group("dllink")
                        video_file_name = r"课程简介-{title}.mp4".format(title=course_title)
                        video_file_path = model.generate_path([main_path, video_file_name])
                        print("课程简介视频: {link}".format(link=info_video_link))
                        main_list.append((info_video_link, video_file_path))
                        break
        else:
            print("Not found this course in \"icourse163.org\",Check Please")
            return

        # Get course's chapter
        payload = {
            'callCount': 1,
            'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
            'httpSessionId': http_session_id,
            'c0-scriptName': 'CourseBean',
            'c0-methodName': 'getLastLearnedMocTermDto',
            'c0-id': 0,
            'c0-param0': term_id,
            'batchId': random.randint(1000000000000, 20000000000000)
        }
        cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
        rdata = session.post(cs_url, data=payload, timeout=None).text

        if re.search(r"var s\d+={}", rdata):
            print("Generate Download information.")

            # Data cleaning Reg
            week_reg = re.compile(r"s\d+.contentId=null;"
                                  r".+s\d+.lessons=(?P<lessons>s\d+)"
                                  r".+s\d+.name=\"(?P<week_name>.+?)\"")
            chapter_reg = re.compile(r"s\d+.chapterId=\d+;"
                                     r".+s\d+.name=\"(?P<chapter_name>.+?)\"")
            lesson_reg = re.compile(r"s\d+.anchorQuestions=(null|s\d+);"
                                    r".+s\d+.contentId=(?P<contentId>\d+)"
                                    r".+s\d+.contentType=(?P<contentType>\d+)"
                                    r".+s\d+.id=(?P<id>\d+)"
                                    r".+s\d+.name=\"(?P<lesson_name>.+?)\"")

            # count_list
            week_list = []
            chapter_list = []
            video_in_chapter_list = []

            for line in rdata.splitlines():
                # Each pattern is anchored at the start of the line and run
                # once; the match object is reused for the group lookups.
                week_re = week_reg.match(line)
                if week_re:  # Week
                    week_name = model.clean_filename(model.raw_unicode_escape(week_re.group("week_name")))
                    week_list.append(week_name)
                chapter_re = chapter_reg.match(line)
                if chapter_re:  # Chapter
                    chapter_name = model.clean_filename(model.raw_unicode_escape(chapter_re.group("chapter_name")))
                    chapter_list.append(chapter_name)
                    print("\n", week_list[-1], chapter_list[-1])
                    video_in_chapter_list.append(0)
                lesson_re = lesson_reg.match(line)
                if lesson_re:
                    lesson_loc_pattern = model.generate_path([week_list[-1], chapter_list[-1]])

                    lesson_name = model.clean_filename(model.raw_unicode_escape(lesson_re.group("lesson_name")))
                    lesson_content_type = int(lesson_re.group("contentType"))

                    # prepare data and post
                    payload = {
                        'callCount': 1,
                        'scriptSessionId': '${scriptSessionId}' + str(random.randint(0, 200)),
                        'httpSessionId': http_session_id,
                        'c0-scriptName': 'CourseBean',
                        'c0-methodName': 'getLessonUnitLearnVo',
                        'c0-id': 1,
                        'c0-param0': lesson_re.group("contentId"),
                        'c0-param1': lesson_content_type,
                        'c0-param2': 0,
                        'c0-param3': lesson_re.group("id"),
                        'batchId': random.randint(1000000000000, 20000000000000)
                    }
                    cs_url = 'http://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr'

                    # Kept in its own name so the chapter listing in `rdata`
                    # is not overwritten while it is being iterated.
                    lesson_rdata = session.post(cs_url, data=payload, timeout=None).text
                    # Sort data depend on it's contentType
                    # 1 -> Video ,2 -> Test ,3 -> Docs ,4 -> Rich text ,5 -> Examination ,6 -> Discussion
                    if lesson_content_type == 1:  # Video
                        count = video_in_chapter_list[-1]
                        count_lesson_name = model.clean_filename("{0} {lesson}".format(count, lesson=lesson_name))
                        for k in ['mp4ShdUrl', 'mp4HdUrl', 'mp4SdUrl']:  # , 'flvShdUrl', 'flvHdUrl', 'flvSdUrl'
                            if re.search(r's\d+.{0}=".+?";'.format(k), lesson_rdata):
                                k_type = re.search("mp4(.+)Url", k).group(1)
                                video_file_name = "{0}.mp4".format(count_lesson_name)
                                if k_type != "Shd":
                                    # Any key other than mp4ShdUrl gets a type suffix in the name.
                                    video_file_name = "{0}_{type}.mp4".format(count_lesson_name, type=k_type)
                                video_link = re.search(r's\d+.' + str(k) + r'="(.+?\.mp4).+?";', lesson_rdata).group(1)
                                video_file_path = model.generate_path([main_path, lesson_loc_pattern, video_file_name])
                                main_list.append((video_link, video_file_path))
                                print("视频: \"{name}\" \"{link}\"".format(name=video_file_name, link=video_link))
                                break
                        # Subtitle
                        if config.Download_Srt:
                            srt_path = model.generate_path([main_path, "Srt", lesson_loc_pattern])
                            srt_chs_re = re.search(r's\d+.name="\\u4E2D\\u6587";s\d+.url="(?P<url>.+?)"', lesson_rdata)
                            if srt_chs_re:  # Chinese
                                srt_file_name = "{0}.chs.srt".format(count_lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_chs_link = srt_chs_re.group("url")
                                print("字幕Chs: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_chs_link))
                                srt_list.append((srt_chs_link, srt_file_path))
                            srt_eng_re = re.search(r's\d+.name="\\u82F1\\u6587";s\d+.url="(?P<url>.+?)"', lesson_rdata)
                            if srt_eng_re:  # English
                                # Same numbered base name as the video and the
                                # Chinese subtitle, so the files stay paired.
                                srt_file_name = "{0}.eng.srt".format(count_lesson_name)
                                srt_file_path = model.generate_path([srt_path, srt_file_name])
                                srt_eng_link = srt_eng_re.group("url")
                                print("字幕Eng: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_eng_link))
                                srt_list.append((srt_eng_link, srt_file_path))
                        video_in_chapter_list[-1] += 1

                    if lesson_content_type == 3 and config.Download_Docs:  # Documentation
                        doc_search = re.search(r'textOrigUrl:"(.+?)"', lesson_rdata)
                        if doc_search:
                            doc_link = str(doc_search.group(1))
                            doc_name = "{0}.pdf".format(lesson_name)
                            doc_path = model.generate_path([main_path, "Docs", lesson_loc_pattern])
                            doc_file_path = model.generate_path([doc_path, doc_name])
                            doc_list.append((doc_link, doc_file_path))
                            print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))
                        else:
                            # A response without a document URL skips this
                            # lesson instead of aborting the whole run.
                            print("Warning: no document link found for \"{name}\", skipped.".format(name=lesson_name))

            if config.Download:
                if config.Download_Method == "Aria2":  # download through aria2
                    model.aira2_download(main_list)
                    # Items that need the session or have time-limited links
                    # still go through the built-in downloader.
                    model.download_queue(session, srt_list + doc_list, queue_length=config.Download_Queue_Length)
                else:  # default: built-in downloader
                    model.download_queue(session, main_list + srt_list + doc_list,
                                         queue_length=config.Download_Queue_Length)
        else:
            # The response may not carry a "message:" field; fall back to a
            # placeholder rather than failing with AttributeError.
            err_search = re.search(r'message:(.+)\}\)', rdata)
            err_message = err_search.group(1) if err_search else "unknown error"
            print("Error:{0},Please make sure you login by 163-email "
                  "and your \"Session-Cookies\" pair is right.".format(err_message))
    else:
        print("No course Id,Please check!")
        return
Пример #7
0
def main(course_url):
    """Gather the download list for one xuetangx.com course and fetch it.

    Steps, in order:

    1. Load the "xuetangx" settings and log in through ``model.login``.
    2. Extract the course id from ``course_url``.
    3. Scrape the ``/about`` page for the title, school, teachers, the cover
       image and, when available, the intro video.
    4. Walk the ``/courseware`` page lesson by lesson and queue videos,
       subtitles and handouts.
    5. Queue the handouts listed on the ``/info`` page, if that page has a
       handout navigation section.
    6. When ``config.Download`` is truthy, hand the collected
       ``(link, file_path)`` pairs to ``model.aira2_download`` and/or
       ``model.download_queue``.

    :param course_url: course URL containing ``courses/<id>/`` followed by one
        of courseware, info, discussion, wiki, progress or about.
    :return: None. Progress and errors are reported with ``print``.
    """
    config = model.config("settings.conf", "xuetangx")  # Loading config
    session = model.login(site="xuetangx", conf=config)
    course_id_search = re.search(r"courses/(?P<id>.+)/(courseware|info|discussion|wiki|progress|about)", course_url)

    # Download cache list
    main_list = []
    srt_list = []
    doc_list = []

    if course_id_search:
        course_id = course_id_search.group("id")
        main_page = "http://www.xuetangx.com/courses/{course_id}".format(course_id=course_id)

        page_about_url = "{course_host}/about".format(course_host=main_page)
        page_about = session.get(url=page_about_url)
        if page_about.text.find("页面无法找到") == -1:  # if Exist
            page_about_bs = BeautifulSoup(page_about.text, "lxml")
            # load course info
            course_detail_bs = page_about_bs.find("section", class_="courseabout_detail")
            course_name_tag = course_detail_bs.find("h3", class_="courseabout_title")

            course_title = model.clean_filename(course_name_tag.get_text())
            school = course_name_tag.find_next("a").get_text()
            teacher = model.sort_teacher(
                page_about_bs.find("ul", class_="teacher_info").find_all("span", class_="name"))
            folder = model.clean_filename('-'.join([course_title, school, teacher]))

            print("The Download INFO:\n"  # Output download course info
                  "link:{url}\nCourse:{folder}\nid:{id}\n".format(url=page_about_url, folder=folder, id=course_id))

            main_path = model.generate_path([config.Download_Path, folder])

            video_box = course_detail_bs.find('div', class_='video_box')
            try:
                info_img_link = model.link_check("http://www.xuetangx.com", video_box['data-poster'])
                info_video_link = get_video(session, video_box["data-ccid"])
                if info_video_link:
                    video_file_name = r"课程简介-{title}.mp4".format(title=course_title)
                    video_file_path = model.generate_path([main_path, video_file_name])
                    print("课程简介视频: {link}".format(link=info_video_link))
                    main_list.append((info_video_link, video_file_path))
            except KeyError:
                # A missing data-* attribute falls back to the <img> inside the box.
                info_img_link = model.link_check("http://www.xuetangx.com", video_box.img["src"])

            if info_img_link:
                img_file_name = r"课程封面图-{title}.jpg".format(title=course_title)
                img_file_path = model.generate_path([main_path, img_file_name])
                print("课程封面图: {link}".format(link=info_img_link))
                main_list.append((info_img_link, img_file_path))
        else:
            print("Not found this course in \"xuetangx.com\",Check Please")
            return

        # Fetch the courseware page and check whether the course was joined
        page_courseware = session.get(url="{0}/courseware".format(main_page))
        if page_courseware.url.find("about") == -1 and page_courseware.url.find("login") == -1:  # outline fetched
            # Decided from the final URL:
            # 1. logged in but not enrolled -> redirected to the ../about page
            # 2. not logged in (or wrong password) -> redirected to
            #    http://www.xuetangx.com/accounts/login?next=..
            print("Generate Download information.")

            # Process the courseware page
            courseware_bs = BeautifulSoup(page_courseware.text, "lxml")
            chapter = courseware_bs.find_all("div", class_="chapter")

            # Compiled once; used for both the count and the per-block check.
            video_type_reg = re.compile(r"data-type=[\'\"]Video[\'\"]")

            for week in chapter:
                week_name = model.clean_filename(week.h3.a.string.strip())
                for lesson in week.ul.find_all("a"):
                    # Get lesson info
                    lesson_name = model.clean_filename(lesson.p.string)  # main title
                    lesson_page = session.get(url="http://www.xuetangx.com{href}".format(href=lesson['href']),
                                              timeout=None)
                    lesson_bs = BeautifulSoup(lesson_page.text, "lxml")

                    # Map each tab's id to its title
                    tab_list = {}
                    for tab in lesson_bs.find_all("a", role="tab"):
                        tab_list[tab.get('id')] = re.search("(.+)", tab.get('title')).group(1)

                    seq_contents = lesson_bs.find_all('div', class_="seq_contents")

                    print("\n", week_name, lesson_name)

                    seq_video_content_len = sum(1 for seq in seq_contents if video_type_reg.search(seq.text))

                    for i, seq in enumerate(seq_contents):
                        seq_name = lesson_name
                        seq_path = model.generate_path([main_path, week_name])
                        srt_path = model.generate_path([main_path, "srt", week_name])
                        doc_path = model.generate_path([main_path, "docs", week_name])
                        if seq_video_content_len > 1:  # a single video needs no sub-folder
                            seq_name_raw = model.clean_filename(tab_list[seq.get("aria-labelledby")])
                            seq_name = r"{0} {1}".format(i, seq_name_raw)
                            seq_path = model.generate_path([seq_path, lesson_name])
                            srt_path = model.generate_path([srt_path, lesson_name])
                            doc_path = model.generate_path([doc_path, lesson_name])

                        if video_type_reg.search(seq.text):  # video
                            lesson_ccsource = re.search(r"data-ccsource=[\'\"](.+)[\'\"]", seq.text).group(1)
                            video_link = get_video(session, lesson_ccsource)
                            video_file_name = "{0}.mp4".format(seq_name)
                            # NOTE(review): this compares the bound method
                            # `video_link.find` with -1, which is always False,
                            # so the "_sd" name is never used. A call such as
                            # video_link.find(<marker>) was presumably meant,
                            # but the marker is not visible here. Behaviour is
                            # left unchanged until that is confirmed.
                            if video_link.find == -1:
                                video_file_name = "{0}_sd.mp4".format(seq_name)
                            video_file_path = model.generate_path([seq_path, video_file_name])
                            print("视频: \"{name}\" \"{link}\"".format(name=video_file_name, link=video_link))
                            main_list.append((video_link, video_file_path))

                            seq_bs = BeautifulSoup(seq.text, "lxml")
                            if config.Download_Srt:  # subtitle
                                srt_anchor = seq_bs.find("a", text="下载字幕")
                                if srt_anchor:
                                    raw_link = srt_anchor["href"]
                                    srt_link = model.link_check("http://www.xuetangx.com", raw_link)
                                    srt_file_name = "{0}.srt".format(seq_name)
                                    srt_file_path = model.generate_path([srt_path, srt_file_name])
                                    print("字幕: \"{name}\" \"{link}\"".format(name=srt_file_name, link=srt_link))
                                    srt_list.append((srt_link, srt_file_path))
                            if config.Download_Docs:  # handout
                                doc_anchor = seq_bs.find("a", text="下载讲义")
                                if doc_anchor:
                                    raw_link = doc_anchor["href"]
                                    doc_link = model.link_check("http://www.xuetangx.com", raw_link)
                                    doc_file_name = model.clean_filename(doc_link.split("/")[-1])
                                    doc_file_path = model.generate_path([doc_path, doc_file_name])
                                    print("文档: \"{name}\" \"{link}\"".format(name=doc_file_name, link=doc_link))
                                    doc_list.append((doc_link, doc_file_path))

        else:  # login failed or the course was not joined
            print("Something Error,You may not Join this course or Enter the wrong password.")
            return

        # Handle the course handouts listed on the info page
        page_info = session.get(url="{0}/info".format(main_page))
        info_bs = BeautifulSoup(page_info.text, "lxml")
        doc_menu = info_bs.find("section", attrs={"aria-label": re.compile("讲义导航")})
        # find() returns None when the page has no handout navigation section;
        # treat that as "no handouts" so the links gathered above still download.
        doc_anchors = doc_menu.find_all("a") if doc_menu is not None else []
        for each in doc_anchors:
            doc_name = each["href"].split("/")[-1]
            doc_link = model.link_check("http://www.xuetangx.com", each["href"])
            doc_file_path = model.generate_path([main_path, "docs", doc_name])
            print("文档: \"{name}\" \"{link}\"".format(name=doc_name, link=doc_link))
            doc_list.append((doc_link, doc_file_path))

        # Download
        if config.Download:
            if config.Download_Method == "Aria2":  # download through aria2
                model.aira2_download(main_list + doc_list)
                # Items that need the session or have time-limited links
                model.download_queue(session, srt_list, queue_length=config.Download_Queue_Length)
            else:  # default: built-in downloader
                model.download_queue(session, main_list + srt_list + doc_list,
                                     queue_length=config.Download_Queue_Length)

    else:
        print("No course Id,Please check!")

    return
Пример #8
0
def main():
    """Create a config with ``batch_size`` and ``num_steps`` set to 50 and
    pass it to ``train`` together with 1000 (the meaning of that second
    argument is defined by ``train``, which is not shown here)."""
    settings = config()
    for option in ("batch_size", "num_steps"):
        setattr(settings, option, 50)
    train(settings, 1000)
Пример #9
0
    print('  Word 编号:       {}'.format([i for i in answer_logits if i != pad]))
    print('  Response Words: {}'.format(
        " ".join([target_int_to_letter[i] for i in answer_logits if i != pad])))


# Script entry point: read both corpora, build the character vocabularies,
# encode every line as a list of ids and start training.
if __name__ == '__main__':
    with open(r'data/letters_source.txt', 'r', encoding='utf-8') as f:
        source_data = f.read()

    with open(r'data/letters_target.txt', 'r', encoding='utf-8') as f:
        target_data = f.read()

    # Preview the first ten lines of each corpus.
    source_lines = source_data.split('\n')
    target_lines = target_data.split('\n')
    print(source_lines[:10])
    print(target_lines[:10])

    # Build the lookup tables
    source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
    target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)

    # Convert the letters to ids; unknown letters map to the '<UNK>' id.
    source_int = []
    for text_line in source_lines:
        source_int.append([source_letter_to_int.get(ch, source_letter_to_int['<UNK>'])
                           for ch in text_line])

    # Every target line additionally ends with the '<EOS>' id.
    target_int = []
    for text_line in target_lines:
        encoded = [target_letter_to_int.get(ch, target_letter_to_int['<UNK>'])
                   for ch in text_line]
        target_int.append(encoded + [target_letter_to_int['<EOS>']])

    # From here on the global name `config` refers to the instance.
    config = config()
    model = Seq2Seq(config, target_letter_to_int, source_letter_to_int)
    train(config, model, source_int, target_int)