示例#1
0
def parse_poem(text, year):
    """Parse a plain-text collection of poems and persist each via write_poem.

    Poems are delimited by '~ ~ ※ ~ ~' separator lines (or lines matching
    module_reg); poet_reg extracts the author name and title_reg the titles.

    Args:
        text: raw text containing multiple poems separated by markers.
        year: string appended (after a CRLF) to each poem's content.
    """
    poet_name = ''
    title = ''
    content = []
    for line in text.split('\n'):
        # Strip whitespace and U+FFFD replacement chars left by bad decoding.
        line = line.strip().replace('\ufffd', '')
        if '~ ~ ※ ~ ~' in line or module_reg.match(line):
            # Section separator: flush the poem collected so far, if any.
            if title:
                write_poem(Profile(title=title, author=poet_name, href=''),
                           '\r\n'.join(content) + '\r\n' + year)
            title = ''
            continue
        poet = poet_reg.findall(line)
        if poet:  # idiomatic truthiness instead of len(...)
            poet_name = poet[0]
            continue
        title_maybe = title_reg.findall(line)

        if title_maybe:
            print(title_maybe, title, len(content))
            # A new title starts the next poem; flush the previous one first.
            if title:
                write_poem(Profile(title=title, author=poet_name, href=''),
                           '\r\n'.join(content) + '\r\n' + year)
            title = title_maybe[0]
            content = []
        else:
            content.append(line)
    # NOTE(review): a trailing poem with no closing separator is never
    # flushed — presumably the input always ends with a separator; confirm.
示例#2
0
def read_poem(href):
    """Fetch one poem page and persist it via write_poem.

    The site serves GB2312-encoded pages; <br/> tags are turned into
    newlines so the poem's line structure survives the HTML parse.
    """
    poem_url = URL + href

    resp = requests.get(poem_url)
    resp.encoding = 'GB2312'
    html = resp.text.replace('<br/>', '\n')

    soup = BeautifulSoup(html, "lxml")

    heading = soup.find('td', class_='main_ArticleTitle')
    title = heading.text.strip() if heading else ''

    body_td = soup.find('td', id='fontzoom')
    if not body_td:
        print('No content: ' + poem_url)
        return

    pieces = []
    for paragraph in body_td.find_all('p'):
        # Blank line (double CRLF) before each paragraph's content.
        pieces.append('\r\n\r\n')
        for node in paragraph.contents:
            pieces.append(str(node).strip().replace('<br/>', '\r\n'))

    write_poem(Profile(href=poem_url, author='顾城', title=title),
               ''.join(pieces))
def split(set_title, lines):
    """Split one collection's lines into individual poems and persist them.

    A line matching TITLE_PATTERN starts a new poem; every line until the
    next title belongs to the current poem's content.

    Args:
        set_title: name of the collection; each poem is saved under the
            combined title '<set_title>:<title>'.
        lines: iterable of raw text lines.
    """
    title = ''
    author = '冰心'
    url = ''
    content = ''
    for line in lines:
        line = line.strip()
        new_t = TITLE_PATTERN.findall(line)
        if new_t:  # idiomatic truthiness instead of len(...)
            # New title found: flush the previous poem first.
            if title:
                write_poem(Profile(url, author, set_title + ':' + title),
                           content)
                content = ''
            # Normalize em-dash to the CJK character '一' used in titles.
            title = new_t[0].replace('—', '一')
        else:
            content = content + '\r\n' + line
    # Flush the trailing poem (no following title triggers it).
    if title:
        write_poem(Profile(url, author, set_title + ':' + title), content)
示例#4
0
文件: __init__.py 项目: raadjoe/COW
    def addProfile(self, author_profile):
        """Adds an author profile to the nanopublication.

        Ingests the profile's triples into the publication's default graph,
        then links both the assertion graph and the publication itself to the
        profile's author via PROV wasAttributedTo.

        Args:
            author_profile: source data passed to the Profile graph wrapper.
        """
        # Bugfix: the original py2 `print "Adding profile"` statement is a
        # SyntaxError on Python 3; print(...) behaves identically on both.
        print("Adding profile")
        # We add all triples from a Profile graph to the default graph of
        # the nanopublication.
        profile_graph = Profile(author_profile)
        self.publication.ingest(profile_graph)
        # We add an attribution relation between the nanopub assertion and
        # the Profile author.
        self.publication.pg.add(
            (self.publication.ag.identifier, PROV['wasAttributedTo'],
             profile_graph.author_uri))
        self.publication.pig.add(
            (self.publication.uri, PROV['wasAttributedTo'],
             profile_graph.author_uri))
def resolve_poem(url, failed_urls):
    """Fetch one poem page and parse title/content/author out of its HTML.

    Tries POEM_PATTERN first, then POEM_PATTERN1/POEM_PATTERN2 as fallbacks.
    URLs that fail to parse are appended to failed_urls (mutated in place).

    Returns:
        False on HTTP error or when the page's author is not a modern
        Chinese poet; None otherwise.
    """
    response = requests.get(url)
    # Bugfix: `is not 200` compared object identity, not value (and emits a
    # SyntaxWarning on modern CPython); use != for value comparison.
    if response.status_code != 200:
        print('http error: url=' + url)
        return False
    text = response.text
    bad_author = is_not_modern_chinese_poet(text)

    if bad_author:
        # Bugfix: message was garbled ('not mofailed_urlsdern ...');
        # also return False for consistency with the HTTP-error branch
        # (both values are falsy, so truthiness checks are unaffected).
        print('not modern chinese poet: ' + bad_author)
        return False
    poem_elements = POEM_PATTERN.findall(text)

    if not poem_elements:
        poem_elements = POEM_PATTERN1.findall(text)
    if not poem_elements:
        poem_elements = POEM_PATTERN2.findall(text)

    if poem_elements:
        poem = poem_elements[0]
        if len(poem) >= 3:
            title = resolve_title(poem[0])
            content = resolve_content(poem[1])
            author = poem[-1].split('<br />')[0].strip()
            # Drop suffixes after any half/full-width paren or comma.
            author = author.split('(')[0].split('(')[0].split(',')[0].split(
                ',')[0]

            # Names with middle dots or brackets indicate classical or
            # foreign poets; skip them.
            if '·' in author or '•' in author or '[' in author or '[' in author:
                print('not modern chinese poet: ' + author)
            else:
                print('PARSED: ' + url + ' ' + title + ' @ ' + author)
                write_poem(Profile(href=url, author=author, title=title),
                           content)
        else:
            # Unexpected match shape; dump it for debugging.
            print(len(poem))
            print(poem)

    else:
        print("Parsed failed: " + url)
        if url not in failed_urls:
            failed_urls.append(url)
def main():
    """Crawl all listing pages, saving poems and recording failed URLs.

    Optional argv[1] sets the starting page (defaults to 1); starting from
    page 1 wipes the tmp output first via remove_tmp_all().
    """
    argv = sys.argv
    start_page = 1
    if len(argv) > 1:
        try:
            start_page = int(argv[1])
        # Narrowed from Exception: int() raises ValueError on bad input.
        except ValueError:
            print("Invalid start page, will start from 1: " + argv[1])
    page_total = get_total_page_num()
    print('page_total=' + str(page_total))

    if start_page == 1:
        remove_tmp_all()

    # Bugfix: failed_urls was only bound inside the loop, so an empty page
    # range raised NameError at the final write_poem call — and only the
    # LAST page's failures were written there. Accumulate across all pages.
    all_failed_urls = []
    for page_num in range(start_page, page_total + 1):
        failed_urls = read_poems(page_num)
        all_failed_urls.extend(failed_urls)
        with open(os.path.join('tmp', 'failed_urls.txt'), 'a') as file:
            for f_url in failed_urls:
                file.write(f_url + '\r\n')
        print('Parsed page: ' + str(page_num))
    write_poem(Profile(author='failed', title='urls', href=''),
               '\r\n'.join(all_failed_urls))
示例#7
0
def parse_poem_profile_td(td):
    """Extract a poem Profile (href, title, author) from one listing <td>.

    Returns None when the cell contains no link, which may happen on the
    last listing page.
    """
    box = td.find('div')
    if box is None:
        box = td

    link = box.find('a')
    if link is None:
        # maybe appears on the last page
        return None

    href = link.get('href')
    # Normalize the title: drop CRLFs, collapse quadruple dashes, convert
    # full-width commas, and rewrite the long-poem-excerpt suffix.
    title = (link.get('title')
             .replace('\r\n', '')
             .replace('————', '——')
             .replace(',', ',')
             .replace('(长诗节选)', '_长诗节选')
             .strip())

    # Remove the link so only the author text remains in the cell.
    link.extract()
    # Normalize the misspelled author name 席慕蓉 -> 席慕容.
    author_text = box.text.replace('席慕蓉', '席慕容').strip()
    # NOTE(review): findall(...)[0] raises IndexError if the author text
    # does not match '<name>(<count>)' — presumably it always does; confirm.
    author = re.findall(r'(.*)\((\d*?)\)', author_text, re.S)[0][0]

    return Profile(href=href, title=title, author=author)