def _get_number_of_pages(self, url):
    tree = download_html_tree(url)
    number_of_pages = self.xnumber_pages(tree)
    if len(number_of_pages) == 0:
        return 1
    else:
        number = number_of_pages[0].text.strip()
        return int(number)
def _get_section_urls(self, channel_url):
    tree = download_html_tree(channel_url)
    links = self.xsection_urls(tree)
    section_urls = []
    for link in reversed(links):
        url = link.attrib['href']
        # Replace the trailing "browser" path segment with "date" and
        # resolve the result against the channel URL.
        url = os.path.join(os.path.split(url)[0], 'date')
        url = urlparse.urljoin(channel_url, url)
        section_urls.append(url)
    return section_urls
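# For illustration, the loop above rewrites each section link so that its
# trailing "browser" segment becomes "date" and then resolves it against
# the channel URL. A standalone sketch with made-up URLs (neither the
# channel URL nor the link path comes from the source):
import os
import urlparse  # urllib.parse in Python 3

example_channel_url = 'http://example.com/forum/channel/12/'
example_href = 'threads/42/browser'
example_url = os.path.join(os.path.split(example_href)[0], 'date')   # 'threads/42/date'
print(urlparse.urljoin(example_channel_url, example_url))            # http://example.com/forum/channel/12/threads/42/date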
def parse_entry(self, local_paths, url):
    load = ParserLoad()
    self._build_code_words(load)
    # Pre/parse/post hooks let subclasses customize how each page of the
    # entry is handled; the shared load object carries state between them.
    self._pre_parse_entry(load, local_paths, url)
    for i, local_path in enumerate(local_paths):
        path = os.path.join(settings.PROJECT_FS_ROOT, local_path)
        load.tree = download_html_tree(path)
        self._parse_entry(path, local_path, url, i, load)
    self._post_parse_entry(load)
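# parse_entry works as a template method: it builds a shared ParserLoad and
# threads it through the _build_code_words, _pre_parse_entry, _parse_entry,
# and _post_parse_entry hooks, one _parse_entry call per downloaded page.
# A minimal sketch of a concrete parser; the class name and the
# load.code_words / load.headers fields are assumptions, not taken from
# the source.
class MinimalParser(object):  # stands in for whatever class defines parse_entry

    def _build_code_words(self, load):
        # Hypothetical: no special code-word handling for this parser.
        load.code_words = set()

    def _pre_parse_entry(self, load, local_paths, url):
        # Reset per-entry state on the shared load object.
        load.headers = []

    def _parse_entry(self, path, local_path, url, index, load):
        # load.tree already holds the parsed page for this local_path.
        load.headers.extend(h.text for h in load.tree.findall('.//h2'))

    def _post_parse_entry(self, load):
        # Report on the accumulated state once all pages are parsed.
        print('{0} headers found'.format(len(load.headers)))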
def toc_download_section(self, model, section):
    page_url = section.url
    tree = download_html_tree(page_url)
    entry_urls = self._parse_toc_entries(page_url, tree)
    if self.reverse_entries:
        entry_urls.reverse()
    # Entry indices are allocated in blocks of 1000 per section.
    e_index = 1000 * section.index
    for entry_url in entry_urls:
        model.entries.append(TocEntry(e_index, entry_url))
        e_index += 1
    section.downloaded = True
def _get_number_of_pages(self, url):
    tree = download_html_tree(url)
    number_of_pages = self.xnumber_pages(tree)
    if len(number_of_pages) == 0:
        return 1
    else:
        number_element = number_of_pages[0]
        number = get_text(number_element)
        match = self.page_regex.search(number)
        if match:
            return int(match.group(1))
        else:
            return 1
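# This variant expects page_regex to capture the total page count in its
# first group. A standalone sketch, assuming a pager label such as
# "Page 1 of 12" (both the pattern and the label text are assumptions,
# not taken from the source):
import re

example_page_regex = re.compile(r'of\s+(\d+)')              # hypothetical pattern
example_match = example_page_regex.search('Page 1 of 12')   # hypothetical pager text from get_text()
print(int(example_match.group(1)) if example_match else 1)  # -> 12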
def toc_download_section(self, model, section):
    pages = []
    page_url = section.url
    # Follow the pager until there is no next page, collecting the entry
    # URLs found on each page.
    while page_url is not None:
        tree = download_html_tree(page_url)
        entry_urls = self._parse_toc_entries(page_url, tree)
        pages.append(entry_urls)
        page_url = self._get_next_toc_page(page_url, tree)
    entry_urls = self._sort_section_entries(pages)
    # Entry indices are allocated in blocks of 1000 per section.
    e_index = 1000 * section.index
    for entry_url in entry_urls:
        model.entries.append(TocEntry(e_index, entry_url))
        e_index += 1
    section.downloaded = True
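# The pagination loop above delegates to two hooks: _get_next_toc_page must
# return the next page's URL (or None to stop), and _sort_section_entries
# must flatten the per-page lists into one ordered list of entry URLs.
# A minimal sketch of possible overrides; the class name, the rel="next"
# pager XPath, the crawl-order flattening, and the assumption that
# download_html_tree returns an lxml tree are all mine, not from the source.
import urlparse  # urllib.parse in Python 3

class PagedTocCrawler(object):  # stands in for whatever class defines toc_download_section

    def _get_next_toc_page(self, page_url, tree):
        # Follow a hypothetical rel="next" pager link, resolved against the
        # current page URL; return None when there is no such link.
        links = tree.xpath('//a[@rel="next"]/@href')
        if len(links) == 0:
            return None
        return urlparse.urljoin(page_url, links[0])

    def _sort_section_entries(self, pages):
        # Keep pages in crawl order and concatenate their entry URLs.
        entry_urls = []
        for page in pages:
            entry_urls.extend(page)
        return entry_urls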
def create_filter_file(file_path, url):
    new_file_path = os.path.join(settings.PROJECT_FS_ROOT, file_path)
    # Append if the filter file already exists, otherwise create it.
    if os.path.exists(new_file_path):
        mode = 'a'
    else:
        mode = 'w'
    with open(new_file_path, mode) as afile:
        tree = download_html_tree(url)
        package_name = get_package_name(tree)
        tables = xmember_tables(tree)
        # Skip the first and last member tables.
        for table in tables[1:-1]:
            for member in xmembers(table):
                member_string = "{0}.{1}".format(package_name, xtext(member))
                afile.write(member_string + '\n')
                print(member_string)
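# A hedged usage example: create_filter_file takes a path relative to
# settings.PROJECT_FS_ROOT and the URL of a Javadoc-style package page
# whose member tables the x* helpers extract. Both arguments below are
# hypothetical and shown as a comment only:
#
#     create_filter_file('filters/java.util.txt',
#                        'http://example.com/javadoc/java/util/package-summary.html')
#
# Each extracted member is written as "<package_name>.<member text>",
# one fully qualified name per line.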
def download_entry(self, entry, path):
    local_paths = []
    next_url = entry.url
    page_id = 0
    # Download every page of the entry, following pagination until
    # _get_next_entry_url returns None.
    while next_url is not None:
        uid = get_safe_local_id(next_url, '_page{0}'.format(page_id))
        new_path = os.path.join(path, uid)
        download_file(next_url, new_path)
        relative_path = get_relative_url(new_path)
        local_paths.append(relative_path)
        tree = download_html_tree(new_path)
        page_id += 1
        next_url = self._get_next_entry_url(next_url, page_id, tree)
    entry.downloaded = True
    entry.local_paths = local_paths