def boards_xml(self):
    # Iterate over every board URL (issue boards and the download board).
    for board, url in self.urls.items():
        is_download = board == 'download'

        get_board_request = requests.get(url, cookies=self.cookies)
        parsed_board = making_soup(get_board_request.content, 'xml')

        doc_ids = [
            id_tag.get_text()
            for id_tag in parsed_board.findAll(self.ID_TAGS[is_download])
        ]

        progress_bar = tqdm(doc_ids)

        xml_path = os.path.join(self.paths[is_download], self.SUB_DIRS[1],
                                board if not is_download else '')

        if not os.path.exists(xml_path) and not is_download:
            os.mkdir(xml_path)

        for doc_id in progress_bar:
            progress_bar.set_description('Now making {0}.xml and '
                                         '{0}.json of {1}'.format(doc_id,
                                                                  board))

            fn = doc_id + '.xml'
            doc_req_url = '{0}/{1}/{2}.xml'.format(self.project_url, board,
                                                   doc_id)
            doc_requests = requests.get(doc_req_url, cookies=self.cookies)

            if not ok_code.match(str(doc_requests.status_code)):
                logging.error('{0} HAS REQUEST ERROR {1}'.format(
                    doc_id, doc_requests.status_code))
                continue

            # Precaution against XML encoding errors
            xml_bytes = doc_requests.content.decode('utf-8', 'replace')
            parsed_xml = xml_bytes.replace(' ', '\n')

            with open(os.path.join(xml_path, fn), 'w',
                      encoding='utf-8') as xml_file:
                xml_file.write(parsed_xml)

            soup = making_soup(parsed_xml, 'xml')

            if not is_download:
                self.make_issue(board, soup)
            else:
                self.make_download(doc_id, soup)
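# Note: ok_code above is assumed to be a module-level compiled regex that
# accepts successful HTTP status codes; a hypothetical definition could look
# like the sketch below (not confirmed by this file):
#
#     import re
#     ok_code = re.compile(r'2\d\d')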
def create_url(self):
    urls = dict()
    types = ['issue', 'forum', 'download']

    for parse_type in types:
        # Board and issue list
        url = '{0}/{1}'.format(self.project_url, parse_type)
        r = requests.get(url, cookies=self.cookies)

        # HTML parsing
        soup = making_soup(r.content, 'html')

        # Get selected class by each parse_type
        cond_class = 'menu_{0} on selected'.format(parse_type)
        class_list = soup.find(class_=cond_class)

        if class_list is not None:
            for a in class_list.ul.find_all('a'):
                text = a.get_text()
                name = a['href'].split('/projects/' + self.name + '/')[1]
                urls[text] = '{0}/{1}.xml'.format(self.project_url, name)
        else:
            urls[parse_type] = '{0}.xml'.format(url)

    return urls
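# Illustrative sketch of the mapping create_url() returns: keys are either the
# board titles scraped from the project menu or, as a fallback, the default
# type names. The URLs below are hypothetical placeholders, not real data:
#
#     {
#         'issue': '<project_url>/issue.xml',
#         'forum': '<project_url>/forum.xml',
#         'download': '<project_url>/download.xml',
#     }
#
# boards_xml() iterates over this mapping, so every value must point to a
# board-level XML listing.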
def wiki(self):
    wiki_path = os.path.join(self.path, ISSUES_DIR, self.SUB_DIRS[0])
    attach_file_path = os.path.join(wiki_path, ISSUE_ATTACH_DIR)

    if not os.path.exists(attach_file_path):
        os.mkdir(attach_file_path)

    project_news_item = self.project_main_soup.find(
        'ul', class_='tab-small').findAll('ul')

    wiki_pages = dict()

    for a_tag in project_news_item[2].findAll('a'):
        url = self.url + a_tag['href'] + '?action=edit'
        wiki_request = requests.get(url, cookies=self.cookies).content
        doc_name = a_tag['title']

        # A private wiki exposes no textarea tag on the edit form, so fall
        # back to the rendered page content instead.
        try:
            wiki_content = making_soup(wiki_request,
                                       'html').textarea.get_text()
        except AttributeError:
            wiki_request = requests.get(self.url + a_tag['href'],
                                        cookies=self.cookies).content
            wiki_content = making_soup(wiki_request, 'html').find(
                'div', id='mycontent')

        wiki_pages[doc_name] = str(wiki_content)

        with open(os.path.join(wiki_path, doc_name) + '.md', 'w',
                  encoding='utf-8') as wiki_doc:
            wiki_doc.write(str(wiki_content))

    return wiki_pages
def milestones(self):
    milestone_url = self.project_url + '/milestone.xml'
    milestone_xml = requests.get(milestone_url, cookies=self.cookies).content
    xml_soup = making_soup(milestone_xml, 'xml')
    milestones_soup = xml_soup.findAll('milestone')
    milestones_path = os.path.join(self.path, 'milestones')

    if not os.path.exists(milestones_path):
        os.mkdir(milestones_path)

    milestones = list()

    if milestones_soup:
        for milestone in milestones_soup:
            ms = Milestone(milestone)
            ms_json = str(ms)
            milestones.append(ms_json)

            with open(os.path.join(milestones_path, ms.id) + '.json',
                      'w') as ms_file:
                ms_file.write(ms_json)

    return milestones
def __init__(self, project_name, dev_code, private):
    self.name = project_name
    self.url = self.NFORGE_URLS[dev_code]
    self.dev_code = dev_code
    self.project_url = '{0}/projects/{1}'.format(self.url, self.name)

    # Default to an empty cookie jar so later requests work even when the
    # project needs neither dev_code nor private access.
    self.cookies = dict()

    if dev_code or private:
        # Get cookies from the COOKIES file
        try:
            with open(self.COOKIE_FILE) as f:
                cookie_list = [cookie for cookie in f]

            for cookie in cookie_list:
                cookie_split = cookie.split('=')
                self.cookies[cookie_split[0]] = \
                    cookie_split[1].replace('\n', '')
        except EnvironmentError:
            raise InvalidCookieError(self.cookies)

    request_main_html = requests.get(self.project_url, cookies=self.cookies)
    self.project_main_html = request_main_html.content
    self.project_main_soup = making_soup(self.project_main_html, 'html')

    self.check_valid_project()

    project_type = 'dev_code' if dev_code else 'open_project'

    self.path = os.path.join(Nforge.__name__, project_type, project_name)
    self.issues_path = os.path.join(self.path, ISSUES_DIR)
    self.downloads_path = os.path.join(self.path, DOWNLOADS_DIR)
    self.attach_path = os.path.join(self.issues_path, 'raw',
                                    ISSUE_ATTACH_DIR)
    self.paths = [self.issues_path, self.downloads_path]

    # Create the directory structure
    make_dirs(self.issues_path)
    make_dirs(self.downloads_path)

    for sub_dir in self.SUB_DIRS:
        issue_data = os.path.join(self.issues_path, sub_dir)
        download_data = os.path.join(self.downloads_path, sub_dir)

        if not os.path.exists(issue_data):
            os.mkdir(issue_data)

        if not os.path.exists(download_data):
            os.mkdir(download_data)

    # Get version control system information
    src_request = requests.get(self.project_url + '/src',
                               cookies=self.cookies)
    src_soup = making_soup(src_request.content, 'html')
    src_title = src_soup.find('title').get_text()

    if not dev_code:
        # Raise NoSrcError when the expected source-browser markup is missing
        # (e.g. the page title indicates a login prompt or an error page).
        if src_soup.find('div', class_='code_contents'):
            self.vcs = 'svn'
        elif '로그인' in src_title or '오류' in src_title:
            raise NoSrcError
        else:
            self.vcs = 'git'

    self.urls = self.create_url()
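# A minimal usage sketch, assuming a valid dev_code key in NFORGE_URLS and,
# for private projects, a COOKIE_FILE containing "name=value" lines. All
# argument values below are hypothetical:
#
#     project = Nforge('sample-project', dev_code=0, private=False)
#     project.wiki()
#     project.milestones()
#     project.boards_xml()
#
# The constructor prepares the local directory layout, detects the VCS type
# for open projects, and pre-builds the board URL map that boards_xml() uses.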