Example 1
0
    def extract_links(self, html):
        """Yield a ``Link`` for every internal anchor found in *html*.

        Anchors whose host is ``neupy.com`` — or relative links, which parse
        with an empty netloc — are kept; anchors to any other host are
        skipped. Fragments are re-attached to the resolved URL.
        """
        for tag in html.findAll('a'):
            parsed = urlparse(tag['href'])

            # Ignore links that point at external hosts.
            if parsed.netloc not in ('neupy.com', ''):
                continue

            absolute = urljoin(self.url, parsed.path)
            if parsed.fragment:
                absolute = "#".join((absolute, parsed.fragment))

            yield Link(uri=absolute, text=tag.text)
Example 2
0
    # Build sparse term-count entries for every collected document: `data`
    # holds the matrix values (all 1s, one per token occurrence) and
    # `indeces` (defined earlier, outside this view) the matching term-id
    # column indices. NOTE(review): `vocabulary`, `term_frequency`, and
    # `indeces` are assumed to be a dict, a counter-like mapping, and a
    # list declared before this fragment — confirm in the full function.
    data = []

    logging.info("Collecting documents")
    all_documents = collect_documents(SITE_DIR)

    logging.info("Define relations between documents")
    webgraph = WebPageGraph.create_from_documents(all_documents)

    for document in all_documents:
        logging.debug('Processing "%s"', document.uri)

        text = document.text
        # Normalize: lowercase and treat '.' and '=' as word separators.
        text = text.lower().replace('.', ' ').replace('=', ' ')

        # Append the anchor texts of pages linking to this document, so the
        # document is also described by how other pages refer to it.
        anchor_texts = []
        for _, link in webgraph.page_linked_by(Link(document.uri)):
            if link.text:
                anchor_texts.append(link.text)

        text = ' '.join([text] + anchor_texts)

        for term in nltk.word_tokenize(text):
            # Grow the vocabulary on the fly; ids are assigned in
            # first-seen order.
            if term not in vocabulary:
                vocabulary[term] = len(vocabulary)

            termid = vocabulary[term]
            term_frequency[termid] += 1

            indeces.append(termid)
            data.append(1)