Example #1
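The function below is Python 2 (note the print >>logf syntax and urllib.urlopen) and leans on imports, module globals, and exception classes defined elsewhere in its module. A minimal sketch of what it assumes; the API_HTML format string and the api_wait throttle are reconstructions, not the original definitions:

import datetime
import json
import time
import urllib

import justext
from bs4 import BeautifulSoup

# Assumed module-level state; shapes guessed from how getPageText uses them.
API_HTML = ('https://%s.wikipedia.org/w/api.php'
            '?action=parse&format=json&prop=text|categories|langlinks&page=%s')
last_api_request = datetime.datetime.now()

class MissingPage(Exception): pass   # requested page does not exist
class EmptyHTML(Exception): pass     # API returned an empty parse
class EmptyJusText(Exception): pass  # jusText kept no paragraphs

def api_wait(last):
    # Hypothetical throttle: keep consecutive API calls at least one second apart.
    elapsed = (datetime.datetime.now() - last).total_seconds()
    if elapsed < 1.0:
        time.sleep(1.0 - elapsed)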
def getPageText(title, langcode, stoplist, logf):
    """Fetch a parsed Wikipedia page via the API, strip boilerplate with
    jusText and return the text wrapped in a <doc> element."""
    global last_api_request
    api_wait(last_api_request)  # throttle successive API requests
    last_api_request = datetime.datetime.now()
    resp = urllib.urlopen(API_HTML % (langcode, title))
    data = json.load(resp)
    if 'error' in data:
        print >>logf, '\tmissing page'
        raise MissingPage()
    p = data['parse']
    html = p['text']['*']
    if not html.strip():
        print >>logf, '\tempty HTML parse returned by API'
        raise EmptyHTML()

    soup = BeautifulSoup(html, 'lxml')
    # Drop navigation markup: table of contents, reference lists,
    # navigation boxes and preformatted blocks.
    for tag, cls in (('div', 'toc'), ('ol', 'references'),
                     ('div', 'navbox'), ('pre', None)):
        for x in soup.findAll(tag, cls):
            x.extract()
    """[x.extract() for x in soup.findAll('table', 'notice noprint notice-todo')]
    [x.extract() for x in soup.findAll('table', 'plainlinks cmbox cmbox-content')]
    [x.extract() for x in soup.findAll('table', 'plainlinks tmbox tmbox-notice')]
    [x.extract() for x in soup.findAll('table', 'metadata plainlinks ambox ambox-speedy')]
    [x.extract() for x in soup.findAll('table', 'plainlinks cmbox cmbox-speedy')]
    [x.extract() for x in soup.findAll('table', 'metadata plainlinks ambox ambox-delete')]
    [x.extract() for x in soup.findAll('table', 'plainlinks noprint xambox xambox-type-notice')]
    [x.extract() for x in soup.findAll('table', 'plainlinks xambox xambox-type-notice')]
    [x.extract() for x in soup.findAll('table', 'plainlinks xambox xambox-type-content')]
    [x.extract() for x in soup.findAll('div', 'label_message')]
    [x.extract() for x in soup.findAll('div', 'boilerplate metadata')]
    [x.extract() for x in soup.findAll('div', 'noprint request box')]
    [x.extract() for x in soup.findAll('td', 'mbox-text')]"""

    if stoplist == 'None':
        # with stopwords_high=0 and stopwords_low=0 the stoplist language is ignored
        paragraphs = justext.justext(soup.encode('utf-8'), justext.get_stoplist('English'),
                                     stopwords_high=0, stopwords_low=0,
                                     no_headings=True, max_link_density=0.5)
    else:
        paragraphs = justext.justext(soup.encode('utf-8'), justext.get_stoplist(stoplist),
                                     no_headings=True, max_link_density=0.5,
                                     stopwords_low=0.35, stopwords_high=0.37)
    text = ''
    parSum = 0   # paragraphs kept
    charSum = 0  # characters kept
    wordSum = 0  # words kept
    for paragraph in paragraphs:
        if paragraph['cfclass'] in ('good', 'neargood'):
            line = paragraph.get('text')
            text += '<p>\n'
            text += line
            text += '\n</p>\n'
            parSum += 1
            wordSum += paragraph['word_count']
            charSum += len(line)
    if parSum == 0:
        print >>logf, '\tempty prevert returned by jusText'
        raise EmptyJusText()
    print >>logf, '\t%d words' % wordSum
    print >>logf, '\t%d paragraphs' % parSum
    categories = ';'.join([d['*'].replace('"', '') for d in p['categories']])
    header = '<doc title="%s" categories="%s" translations="%d" paragraphs="%d" words="%d" chars="%d">\n' %\
            (title.decode('utf-8'), categories, len(p['langlinks']), parSum, wordSum, charSum)
    return header + text + '</doc>\n'
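The dict lookups above (paragraph['cfclass'], paragraph['text'], paragraph['word_count']) belong to an old jusText release in which justext.justext() returned a list of dicts. Current justext (2.x) returns Paragraph objects instead; a minimal sketch of the equivalent filtering, close to the example in the justext README:

import requests
import justext

# Fetch a page and keep only the paragraphs jusText classifies as main content.
response = requests.get('https://planet.python.org/')
paragraphs = justext.justext(response.content, justext.get_stoplist('English'))
for paragraph in paragraphs:
    if not paragraph.is_boilerplate:  # roughly the 'good'/'neargood' filter above
        print(paragraph.text)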
Example #2
    def test_no_titles(self):
        # Note the trailing space: repeated copies of s stay word-separated.
        s = 'text and some <em>other</em> words <span class="class">that I</span> have in my head now '
        html_string = (
            '<html><body>'
            '<p>{0}</p>'
            '<p>text and some <em>other</em> words <span class="class">that I</span> have in my head now</p>'
            '<h2>Smaller header</h2>'
            '<p>footer</p>'
            '</body></html>'
        ).format(s * 5)

        pars = justext(html_string, get_stoplist("English"))
        title = justitle(html_string, pars)
        assert title is None
Example #3
def extract_named_entities(article):
    # Strip boilerplate, keeping only good non-heading paragraphs.
    paragraphs = justext(article.source_text.encode("utf-8"), "English")

    pprint(paragraphs)
    text = u" ".join(p["text"] for p in paragraphs
                     if p["class"] == "good" and not p.get("heading"))
    sentences = nltk.sent_tokenize(text)
    print sentences
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
    tagged_sentences = [nltk.pos_tag(sent) for sent in tokenized_sentences]
    named_entity_chunks = [nltk.ne_chunk(sent, binary=True) for sent in tagged_sentences]
    for chunk in named_entity_chunks:
        for pos in chunk:
            if isinstance(pos, tuple):  # plain (word, tag) token, not an NE subtree
                continue
            yield u" ".join((w for (w, p) in pos))
Example #4
    def test_accepts_html_tree(self):
        try:
            justext(html_to_dom(html_string), get_stoplist("English"))
        except JustextMalformedInput:
            self.fail("justext wasn't supposed to raise JustextMalformedInput on an HTML tree")
Example #5
    def test_no_heading(self):
        html_string = '<html><body><title>Header | site.com</title></body></html>'

        pars = justext(html_string, get_stoplist("English"))
        title = justitle(html_string, pars)
        assert title == "Header | site.com"