def parse_poem(text, year):
    """Parse a plain-text poem collection and emit each poem via write_poem.

    Poems are delimited by '~ ~ ※ ~ ~' separators or lines matching
    module_reg; poet and title lines are recognized with poet_reg and
    title_reg. Each finished poem is written with its year appended.
    """
    author = ''
    current_title = ''
    body = []
    for raw in text.split('\n'):
        # Strip whitespace and drop Unicode replacement characters left
        # over from a lossy decode.
        cleaned = raw.strip().replace('\ufffd', '')
        # Section separator: flush the poem collected so far, if any.
        if '~ ~ ※ ~ ~' in cleaned or module_reg.match(cleaned):
            if current_title:
                write_poem(Profile(title=current_title, author=author, href=''),
                           '\r\n'.join(body) + '\r\n' + year)
                current_title = ''
            continue
        poet_hits = poet_reg.findall(cleaned)
        if poet_hits:
            author = poet_hits[0]
            continue
        title_hits = title_reg.findall(cleaned)
        if title_hits:
            print(title_hits, current_title, len(body))
            # A new title starts a new poem; flush the previous one first.
            if current_title:
                write_poem(Profile(title=current_title, author=author, href=''),
                           '\r\n'.join(body) + '\r\n' + year)
            current_title = title_hits[0]
            body = []
        else:
            body.append(cleaned)
def read_poem(href):
    """Fetch a single poem page at URL + href and emit it via write_poem.

    The remote pages are GB2312-encoded; <br/> tags are normalized to
    newlines before parsing. Prints a notice and returns early when the
    content cell is missing.
    """
    poem_url = URL + href
    response = requests.get(poem_url)
    response.encoding = 'GB2312'
    html = response.text.replace('<br/>', '\n')
    soup = BeautifulSoup(html, "lxml")
    title_node = soup.find('td', class_='main_ArticleTitle')
    title = title_node.text.strip() if title_node else ''
    body_td = soup.find('td', id='fontzoom')
    if not body_td:
        print('No content: ' + poem_url)
        return
    pieces = []
    for paragraph in body_td.find_all('p'):
        # Blank line between paragraphs.
        pieces.append('\r\n\r\n')
        for child in paragraph.contents:
            pieces.append(str(child).strip().replace('<br/>', '\r\n'))
    write_poem(Profile(href=poem_url, author='顾城', title=title),
               ''.join(pieces))
def split(set_title, lines):
    """Split a poem-set's lines into individual poems and emit each one.

    A line matching TITLE_PATTERN starts a new poem; everything else is
    accumulated as the current poem's body. The set title is prefixed to
    each poem title as 'set:title'.
    """
    author = '冰心'
    url = ''
    title = ''
    body = ''
    for raw in lines:
        stripped = raw.strip()
        hits = TITLE_PATTERN.findall(stripped)
        if hits:
            # Flush the previous poem before starting a new one.
            if title:
                # NOTE(review): Profile is called positionally here while
                # other call sites use keywords (href=/author=/title=) —
                # confirm the argument order matches Profile's signature.
                write_poem(Profile(url, author, set_title + ':' + title), body)
                body = ''
            title = hits[0].replace('—', '一')
        else:
            body = body + '\r\n' + stripped
    # Flush the trailing poem, if any.
    if title:
        write_poem(Profile(url, author, set_title + ':' + title), body)
def addProfile(self, author_profile):
    """Add an author profile to the nanopublication.

    Ingests all triples from the Profile graph into the publication's
    default graph, then records a PROV 'wasAttributedTo' link from both
    the assertion graph and the publication itself to the profile's
    author URI.
    """
    # Fixed: was a Python 2 print statement; the rest of the file uses
    # Python 3 print() calls.
    print("Adding profile")
    # We add all triples from a Profile graph to the default graph of
    # the nanopublication.
    profile_graph = Profile(author_profile)
    self.publication.ingest(profile_graph)
    # We add an attribution relation between the nanopub assertion and
    # the Profile author.
    self.publication.pg.add(
        (self.publication.ag.identifier, PROV['wasAttributedTo'],
         profile_graph.author_uri))
    self.publication.pig.add(
        (self.publication.uri, PROV['wasAttributedTo'],
         profile_graph.author_uri))
def resolve_poem(url, failed_urls):
    """Fetch *url*, try to extract a poem from the HTML, and emit it.

    Tries POEM_PATTERN, then POEM_PATTERN1, then POEM_PATTERN2. Poems by
    non-modern-Chinese poets are skipped. On parse failure the url is
    appended to *failed_urls* (mutated in place). Returns False on HTTP
    error, otherwise None.
    """
    response = requests.get(url)
    # Fixed: `is not 200` compared object identity, which is unreliable
    # for ints (and a SyntaxWarning on modern Python); compare by value.
    if response.status_code != 200:
        print('http error: url=' + url)
        return False
    text = response.text
    bad_author = is_not_modern_chinese_poet(text)
    if bad_author:
        # Fixed: message was garbled ('mofailed_urlsdern') by a bad
        # search/replace.
        print('not modern chinese poet: ' + bad_author)
        return
    poem_elements = POEM_PATTERN.findall(text)
    if not poem_elements:
        poem_elements = POEM_PATTERN1.findall(text)
    if not poem_elements:
        poem_elements = POEM_PATTERN2.findall(text)
    if poem_elements:
        poem = poem_elements[0]
        if len(poem) >= 3:
            title = resolve_title(poem[0])
            content = resolve_content(poem[1])
            author = poem[-1].split('<br />')[0].strip()
            # Remove suffixes: stop at the first ASCII or full-width
            # parenthesis/comma.
            author = author.split('(')[0].split('（')[0].split(',')[0].split(
                '，')[0]
            if '·' in author or '•' in author or '[' in author or '［' in author:
                # Fixed: same garbled 'mofailed_urlsdern' message.
                print('not modern chinese poet: ' + author)
            else:
                print('PARSED: ' + url + ' ' + title + ' @ ' + author)
                write_poem(Profile(href=url, author=author, title=title),
                           content)
        else:
            print(len(poem))
            print(poem)
    else:
        print("Parsed failed: " + url)
        if url not in failed_urls:
            failed_urls.append(url)
def main():
    """Crawl all poem listing pages, optionally resuming from a CLI page.

    Usage: script [start_page]. Failed URLs for each page are appended to
    tmp/failed_urls.txt and also emitted through write_poem.
    """
    args = sys.argv
    start_page = 1
    if len(args) > 1:
        try:
            start_page = int(args[1])
        except Exception:
            print("Invalid start page, will start from 1: " + args[1])
    page_total = get_total_page_num()
    print('page_total=' + str(page_total))
    # A fresh run (page 1) clears any previous temporary output.
    if start_page == 1:
        remove_tmp_all()
    for page_num in range(start_page, page_total + 1):
        failed_urls = read_poems(page_num)
        with open(os.path.join('tmp', 'failed_urls.txt'), 'a') as out:
            for failed in failed_urls:
                out.write(failed + '\r\n')
        print('Parsed page: ' + str(page_num))
        write_poem(Profile(author='failed', title='urls', href=''),
                   '\r\n'.join(failed_urls))
def parse_poem_profile_td(td):
    """Extract a poem Profile (href, title, author) from a listing-table cell.

    Returns None when the cell has no <a> link, which can happen on the
    last listing page.
    """
    container = td.find('div')
    if container is None:
        container = td
    link = container.find('a')
    if link is None:
        # maybe appears on the last page
        return None
    href = link.get('href')
    title = (link.get('title')
             .replace('\r\n', '')
             .replace('————', '——')
             .replace(',', ',')
             .replace('(长诗节选)', '_长诗节选')
             .strip())
    # Remove the link node so only the author text remains in the cell.
    link.extract()
    # The site misspells the author name 席慕蓉; normalize to 席慕容.
    author_text = container.text.replace('席慕蓉', '席慕容').strip()
    # NOTE(review): assumes the author text always matches 'name(count)';
    # an unmatched cell would raise IndexError here — confirm upstream.
    author = re.findall(r'(.*)\((\d*?)\)', author_text, re.S)[0][0]
    return Profile(href=href, title=title, author=author)