Example #1
    def _get_number_of_pages(self, url):
        tree = download_html_tree(url)
        number_of_pages = self.xnumber_pages(tree)
        if len(number_of_pages) == 0:
            # No page-count element found: assume a single page.
            return 1
        else:
            number = number_of_pages[0].text.strip()
            return int(number)
Example #2
    def _get_section_urls(self, channel_url):
        tree = download_html_tree(channel_url)
        links = self.xsection_urls(tree)
        section_urls = []
        for link in reversed(links):
            url = link.attrib['href']
            # Replace the trailing "browser" path segment with "date".
            url = os.path.join(os.path.split(url)[0], 'date')
            url = urlparse.urljoin(channel_url, url)
            section_urls.append(url)
        return section_urls
Example #3
    def parse_entry(self, local_paths, url):
        load = ParserLoad()
        self._build_code_words(load)
        self._pre_parse_entry(load, local_paths, url)

        for i, local_path in enumerate(local_paths):
            path = os.path.join(settings.PROJECT_FS_ROOT, local_path)
            load.tree = download_html_tree(path)
            self._parse_entry(path, local_path, url, i, load)

        self._post_parse_entry(load)
Example #4
    def parse_entry(self, local_paths, url):
        load = ParserLoad()
        self._build_code_words(load)
        self._pre_parse_entry(load, local_paths, url)

        for i, local_path in enumerate(local_paths):
            path = os.path.join(settings.PROJECT_FS_ROOT, local_path)
            load.tree = download_html_tree(path)
            self._parse_entry(path, local_path, url, i, load)

        self._post_parse_entry(load)
Example #5
    def toc_download_section(self, model, section):
        page_url = section.url
        tree = download_html_tree(page_url)
        entry_urls = self._parse_toc_entries(page_url, tree)

        if self.reverse_entries:
            entry_urls.reverse()

        e_index = 1000 * section.index
        for entry_url in entry_urls:
            model.entries.append(TocEntry(e_index, entry_url))
            e_index += 1
        section.downloaded = True
Example #6
    def toc_download_section(self, model, section):
        page_url = section.url
        tree = download_html_tree(page_url)
        entry_urls = self._parse_toc_entries(page_url, tree)

        if self.reverse_entries:
            entry_urls.reverse()

        e_index = 1000 * section.index
        for entry_url in entry_urls:
            model.entries.append(TocEntry(e_index, entry_url))
            e_index += 1
        section.downloaded = True
Example #7
    def _get_number_of_pages(self, url):
        tree = download_html_tree(url)
        number_of_pages = self.xnumber_pages(tree)
        if len(number_of_pages) == 0:
            return 1
        else:
            number_element = number_of_pages[0]
            number = get_text(number_element)
            match = self.page_regex.search(number)
            if match:
                return int(match.group(1))
            else:
                return 1
Example #8
    def toc_download_section(self, model, section):
        pages = []
        page_url = section.url
        while page_url is not None:
            tree = download_html_tree(page_url)
            entry_urls = self._parse_toc_entries(page_url, tree)
            pages.append(entry_urls)
            page_url = self._get_next_toc_page(page_url, tree)

        entry_urls = self._sort_section_entries(pages)
        e_index = 1000 * section.index
        for entry_url in entry_urls:
            model.entries.append(TocEntry(e_index, entry_url))
            e_index += 1
        section.downloaded = True
Example #9
    def toc_download_section(self, model, section):
        pages = []
        page_url = section.url
        while page_url is not None:
            tree = download_html_tree(page_url)
            entry_urls = self._parse_toc_entries(page_url, tree)
            pages.append(entry_urls)
            page_url = self._get_next_toc_page(page_url, tree)

        entry_urls = self._sort_section_entries(pages)
        e_index = 1000 * section.index
        for entry_url in entry_urls:
            model.entries.append(TocEntry(e_index, entry_url))
            e_index += 1
        section.downloaded = True
Example #10
def create_filter_file(file_path, url):
    new_file_path = os.path.join(settings.PROJECT_FS_ROOT, file_path)
    if os.path.exists(new_file_path):
        mode = 'a'
    else:
        mode = 'w'

    with open(new_file_path, mode) as afile:
        tree = download_html_tree(url)
        package_name = get_package_name(tree)
        tables = xmember_tables(tree)
        for table in tables[1:-1]:
            for member in xmembers(table):
                member_string = "{0}.{1}".format(package_name, xtext(member))
                afile.write(member_string + '\n')
                print(member_string)
Example #11
def create_filter_file(file_path, url):
    new_file_path = os.path.join(settings.PROJECT_FS_ROOT, file_path)
    if os.path.exists(new_file_path):
        mode = 'a'
    else:
        mode = 'w'

    with open(new_file_path, mode) as afile:
        tree = download_html_tree(url)
        package_name = get_package_name(tree)
        tables = xmember_tables(tree)
        for table in tables[1:-1]:
            for member in xmembers(table):
                member_string = "{0}.{1}".format(package_name, xtext(member))
                afile.write(member_string + '\n')
                print(member_string)
Example #12
    def download_entry(self, entry, path):
        local_paths = []
        next_url = entry.url
        page_id = 0

        while next_url is not None:
            uid = get_safe_local_id(next_url, '_page{0}'.format(page_id))
            new_path = os.path.join(path, uid)
            download_file(next_url, new_path)
            relative_path = get_relative_url(new_path)
            local_paths.append(relative_path)
            tree = download_html_tree(new_path)
            page_id += 1
            next_url = self._get_next_entry_url(next_url, page_id, tree)

        entry.downloaded = True
        entry.local_paths = local_paths
Example #13
    def download_entry(self, entry, path):
        local_paths = []
        next_url = entry.url
        page_id = 0

        while next_url is not None:
            uid = get_safe_local_id(next_url, "_page{0}".format(page_id))
            new_path = os.path.join(path, uid)
            download_file(next_url, new_path)
            relative_path = get_relative_url(new_path)
            local_paths.append(relative_path)
            tree = download_html_tree(new_path)
            page_id += 1
            next_url = self._get_next_entry_url(next_url, page_id, tree)

        entry.downloaded = True
        entry.local_paths = local_paths
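
Every example above relies on the same download_html_tree helper, which is not part of this listing. It accepts either a URL or a local file path (Examples #3, #4, #12, and #13 re-parse pages already saved under settings.PROJECT_FS_ROOT) and returns a parsed HTML tree whose elements expose .attrib and .text. As a rough illustration only, and not the original implementation, a minimal sketch of such a helper could look like the following, assuming lxml for parsing and requests for HTTP fetching:

# Hypothetical sketch of the shared helper; the real implementation is not shown in the examples.
import requests
from lxml import html


def download_html_tree(url_or_path):
    """Return an lxml HTML element tree for a remote URL or a local file path."""
    if url_or_path.startswith(('http://', 'https://')):
        response = requests.get(url_or_path)
        response.raise_for_status()
        return html.fromstring(response.content)
    # Otherwise treat the argument as a previously downloaded page on disk,
    # as in the examples that parse files under PROJECT_FS_ROOT.
    with open(url_or_path, 'rb') as page_file:
        return html.fromstring(page_file.read())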