Example #1
def test_add_node():
    graph = Graph()
    graph.add_node(Node('a'))

    assert Node('a') in graph._adjacency_list.keys()
    assert len(graph._adjacency_list.keys()) == 1
    assert len(graph._adjacency_list[Node('a')]) == 0
Example #2
def test_get_nodes():
    graph = Graph()
    graph.add_node(Node('a'))
    graph.add_node(Node('b'))

    nodes = graph.get_nodes()
    assert len(nodes) == 2
    assert Node('a') in nodes and Node('b') in nodes
Example #3
def test_add_duplicate_edge():
    graph = Graph()
    graph.add_edge(Node('a'), Node('b'))
    graph.add_edge(Node('a'), Node('b'))

    assert Node('a') in graph._adjacency_list.keys()
    assert Node('b') in graph._adjacency_list.keys()
    assert len(graph._adjacency_list.keys()) == 2
    assert len(graph._adjacency_list[Node('a')]) == 1
    assert len(graph._adjacency_list[Node('b')]) == 0
Example #4
def test_get_connected_from():
    graph = Graph()
    graph.add_edge(Node('a'), Node('b'))
    graph.add_edge(Node('a'), Node('c'))

    assert graph.get_nodes_count() == 3
    assert graph.get_edges_count() == 2

    connected_from = graph.get_connected_from(Node('a'))
    assert len(connected_from) == 2
    assert Node('b') in connected_from
    assert Node('c') in connected_from
Example #5
def test_rank():
    sentence_list = sorted([
        'This is a sentence.', 'A quick brown fox.',
        'Jumped over the lazy dog.'
    ])

    sentence_nodes = sentences.rank(
        [Node(s, score=sentences.DEFAULT_NODE_SCORE) for s in sentence_list])
    sentence_nodes = sorted(sentence_nodes, key=lambda n: n.data)

    # Here we expect that each sentence node will have the same score. This is
    # because the 'clean' (non-stop) words in each sentence have no similarity
    # with any other sentence.
    for i in range(len(sentence_list)):
        assert sentence_nodes[i].data == sentence_list[i]
        assert sentence_nodes[i].score == sentences.DEFAULT_NODE_SCORE
Example #6
def test_get_nodes_count():
    graph = Graph()
    graph.add_node(Node('a'))

    assert graph.get_nodes_count() == 1
Example #7
File: test_node.py Project: jamo95/Newsy
def test_get_averaged_score_data():
    node = Node('a', score=10)

    assert len(node.variations) == 0
    assert node.get_averaged_score() == 10.0
Example #8
File: test_node.py Project: jamo95/Newsy
def test_get_averaged_score_variations():
    node = Node('a', score=10)
    node.variations = ['v1', 'v2']

    assert node.get_averaged_score() == 5.0
Example #9
File: test_node.py Project: jamo95/Newsy
def test_equality():
    node_a = Node('a')
    node_b = Node('a')

    assert node_a == node_b
Example #10
File: test_node.py Project: jamo95/Newsy
def test_remove_variation():
    node = Node('a')
    node.variations = ['v1', 'v2']
    node.remove_variation('v1')

    assert set(node.variations) == set(['v2'])
Example #11
File: test_node.py Project: jamo95/Newsy
def test_add_variation():
    node = Node('a')
    node.add_variation('v')

    assert set(node.variations) == set(['v'])
Example #12
File: test_node.py Project: jamo95/Newsy
def test_inequality():
    node_a = Node('a')
    node_b = Node('b')

    assert node_a != node_b
Example #13
File: dl.py Project: jamo95/Newsy
def dl_techcrunch(session, stemmer, year, month, day):
    base_url = 'https://techcrunch.com/'
    archive_url = '{}{}/{}/{}'.format(base_url, year, month, day)

    # Download links for the articles.
    response = requests.get(archive_url)
    if not 200 <= response.status_code < 300:
        print('techcrunch failed to download archive page: status {}'.format(
            response.status_code))
        return False

    html = BeautifulSoup(response.text, 'html.parser')
    post_titles = html.select('.post-title')

    article_urls = []
    for post_title in post_titles:
        url = post_title.select('a')[0].attrs['href']
        if url.replace('www.', '').startswith(base_url):
            article_urls.append(url)

    # Download, summarise and store articles.
    for url in article_urls:
        # Check if we already have the article in the DB.
        if _get_article(session, normalize_url(url)):
            print('~ {}'.format(normalize_url(url)))
            continue

        tc_article = techcrunch.ArticleLoader.load(url)

        # Download the article content.
        try:
            article = _download_article(url)

            # Use for now until summaries and loaders are better.
            article.nlp()
        except newspaper.article.ArticleException:
            print('- {}'.format(normalize_url(url)))
            # Skip articles that failed to download rather than falling
            # through with an undefined (or stale) article object.
            continue

        # Normalise and hash all the sentences to make finding the index more
        # accurate.
        text_sentences = [
            _hash_text(s) for s in tokenize_sentences(article.text)]

        # Conform data for entry into DB.
        summary_sentences = []
        keywords = [Node(w) for w in article.keywords]

        for tag in tc_article.get('tags', []):
            if tag not in article.keywords:
                keywords.append(Node(tag))

        for sentence in article.summary.split('\n'):
            index = text_sentences.index(_hash_text(sentence))
            summary_sentences.append(Node(sentence, index=index))

        # Insert article and summary into DB.
        dao.article.insert(
            session=session,
            text=tc_article.get('content', article.text),
            url=normalize_url(url),
            title=tc_article.get('title', article.title),
            keywords=keywords,
            sentences=summary_sentences,
            published_at=_format_timestamp(year, month, day),
            s_analysis=None,
        )
        print('+ {}'.format(url))
    return True
Example #14
File: dl.py Project: jamo95/Newsy
def dl_hackernoon(session, stemmer, year, month, day):
    base_url = 'https://hackernoon.com/'
    archive_url = '{}archive/{}/{}/{}'.format(base_url, year, month, day)

    # Download links for the articles.
    response = requests.get(archive_url)
    if not 200 <= response.status_code < 300:
        print('hackernoon failed to download archive page: status {}'.format(
            response.status_code))
        return False

    html = BeautifulSoup(response.text, 'html.parser')
    post_title_list = html.select('div.js-postStream')

    article_urls = []
    if post_title_list:
        for anchor in post_title_list[0].select('a'):
            url = anchor.attrs['href']
            if url.replace('www.', '').startswith(base_url):
                if 'source=collection_archive' in url:
                    url = url.split('?')[0]
                    match = re.match('[a-z0-9]+', url.split('-')[-1])
                    if match and url not in article_urls and '@' not in url:
                        article_urls.append(url)

    # Download, summarise and store articles.
    for url in article_urls:
        # Check if we already have the article in the DB.
        if _get_article(session, normalize_url(url)):
            print('~ {}'.format(normalize_url(url)))
            continue

        # Download the article content.
        try:
            article = _download_article(url)

            # Use for now until summaries and loaders are better.
            article.nlp()
        except newspaper.article.ArticleException:
            print('- {}'.format(normalize_url(url)))
            # Skip articles that failed to download rather than falling
            # through with an undefined (or stale) article object.
            continue

        hn_article = hackernoon.ArticleLoader.load(url)

        # Normalise and hash all the sentences to make finding the index more
        # accurate.
        text_sentences = [
            _hash_text(s) for s in tokenize_sentences(article.text)]

        # Conform data for entry into DB.
        summary_sentences = []
        keywords = [Node(w) for w in article.keywords]

        for sentence in article.summary.split('\n'):
            index = text_sentences.index(_hash_text(sentence))
            summary_sentences.append(Node(sentence, index=index))

        # Insert article and summary into DB.
        dao.article.insert(
            session=session,
            text=hn_article.get('content', article.text),
            url=normalize_url(url),
            title=hn_article.get('title', article.title),
            keywords=keywords,
            sentences=summary_sentences,
            published_at=_format_timestamp(year, month, day),
            s_analysis=None,
        )
        print('+ {}'.format(url))
    return True
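Both downloaders locate each summary sentence inside the article body by comparing hashed, normalised sentences through a _hash_text helper that is not shown on this page. A plausible sketch follows; the normalisation rules (whitespace collapsing, lower-casing) and the choice of SHA-1 are assumptions for illustration only, not the project's actual helper.

import hashlib
import re


def _hash_text(text):
    # Hypothetical helper: collapse whitespace and case before hashing so a
    # summary sentence matches its counterpart in the full article text even
    # when the surrounding formatting differs.
    normalized = re.sub(r'\s+', ' ', text).strip().lower()
    return hashlib.sha1(normalized.encode('utf-8')).hexdigest()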
Example #15
File: graph.py Project: jamo95/Newsy
    def has_node(self, data):
        '''Return true if there exists a node with the specified data.'''

        return Node(data) in self._adjacency_list
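The Graph and Node classes themselves are not listed on this page. Purely as a reading aid, here is a minimal sketch of the interface the graph tests above exercise, reconstructed from their assertions alone; the dict-of-lists storage, the duplicate-edge check, and every method body are assumptions rather than the project's actual graph.py.

class Node:
    '''Minimal stand-in: nodes compare and hash by their data (assumed).'''

    def __init__(self, data, score=0.0, index=None):
        self.data = data
        self.score = score
        self.index = index

    def __eq__(self, other):
        return isinstance(other, Node) and self.data == other.data

    def __hash__(self):
        return hash(self.data)


class Graph:
    '''Directed graph over Nodes, stored as node -> list of successors.'''

    def __init__(self):
        self._adjacency_list = {}

    def add_node(self, node):
        # Register the node with an empty successor list if unseen.
        self._adjacency_list.setdefault(node, [])

    def add_edge(self, source, destination):
        # Ensure both endpoints exist, then record the edge at most once
        # (test_add_duplicate_edge expects the second insert to be a no-op).
        self.add_node(source)
        self.add_node(destination)
        if destination not in self._adjacency_list[source]:
            self._adjacency_list[source].append(destination)

    def get_nodes(self):
        return list(self._adjacency_list)

    def get_nodes_count(self):
        return len(self._adjacency_list)

    def get_edges_count(self):
        return sum(len(out) for out in self._adjacency_list.values())

    def get_connected_from(self, node):
        return list(self._adjacency_list.get(node, []))

    def has_node(self, data):
        '''Return true if there exists a node with the specified data.'''
        return Node(data) in self._adjacency_list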
Example #16
def test_get_edges_count():
    graph = Graph()
    graph.add_edge(Node('a'), Node('b'))

    assert graph.get_nodes_count() == 2
    assert graph.get_edges_count() == 1
Example #17
def test_has_node():
    graph = Graph()
    graph.add_node(Node('a'))

    assert graph.has_node('a')
    assert not graph.has_node('b')
Example #18
File: test_node.py Project: jamo95/Newsy
def test_has_variation():
    node = Node('a')
    node.variations = ['v']

    assert node.has_variation('v')
    assert not node.has_variation('w')
Example #19
File: test_node.py Project: jamo95/Newsy
def test_get_variations():
    node = Node('a')
    node.variations = ['v']

    assert set(node.get_variations()) == set(['v'])
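Similarly, the variation and scoring behaviour asserted in Examples #7 through #12, #18 and #19 can be read back into a small sketch of Node. Again this is an inference from the tests, not the implementation under test; in particular the averaged-score formula (score divided by the variation count, treating an empty list as a count of one) is only implied by Examples #7 and #8.

class Node:
    '''Sketch of the variation/score behaviour the node tests exercise.'''

    def __init__(self, data, score=0.0, index=None):
        self.data = data
        self.score = score
        self.index = index
        self.variations = []

    def __eq__(self, other):
        # Equality (and hence inequality) is by data only.
        return isinstance(other, Node) and self.data == other.data

    def __hash__(self):
        return hash(self.data)

    def add_variation(self, variation):
        if variation not in self.variations:
            self.variations.append(variation)

    def remove_variation(self, variation):
        if variation in self.variations:
            self.variations.remove(variation)

    def has_variation(self, variation):
        return variation in self.variations

    def get_variations(self):
        return list(self.variations)

    def get_averaged_score(self):
        # Assumed formula, inferred from Example #7 (score 10 -> 10.0 with no
        # variations) and Example #8 (score 10 -> 5.0 with two variations).
        return self.score / max(1, len(self.variations))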
Example #20
def _summarize(text='',
               title='',
               url='',
               sentence_count=DEFAULT_SENTENCE_COUNT,
               suggestedKeywords=None,
               keyword_count=DEFAULT_KEYWORD_COUNT):

    # Default s_analysis to None so later lookups do not fail when sentiment
    # analysis returns no result.
    article_data = {
        'title': title, 'text': text, 'url': url, 's_analysis': None}

    if url:
        # Check if article is cached
        article = _get_summary(normalize_url(url))
        if article:
            if suggestedKeywords is not None:
                for word in suggestedKeywords:
                    newKeyword = Keyword(word, 1)
                    if newKeyword not in article.keywords:
                        article.keywords.insert(0, newKeyword)
                db.session.add(article)
                db.session.commit()

            if article.s_analysis is None:
                senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                    article.text)
                if senti_analysis_data is not None:
                    article.s_analysis = senti_analysis_data['label']
            return {
                'title': article.title,
                'text': article.text,
                'sentences':
                [s.data for s in article.sentences][:sentence_count],
                'keywords': [w.data for w in article.keywords][:keyword_count],
                's_analysis': article.s_analysis,
            }

        article = _get_article_from_url(url)
        article_data['text'] = article.text
        if 'techcrunch' in url:
            tc_article = techcrunch.ArticleLoader.load(url)
            article_data['title'] = tc_article['title']
            article_data['text'] = tc_article['content']
            article_data['published_at'] = tc_article['timestamp']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                tc_article['content'])
            print(senti_analysis_data)
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        elif 'wired' in url:
            wired_article = wired.ArticleLoader.load(url)
            article_data['title'] = wired_article['title']
            article_data['text'] = wired_article['content']
            article_data['published_at'] = wired_article['date']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                wired_article['content'])
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        elif 'hackernoon' in url:
            hackernoon_article = hackernoon.ArticleLoader.load(url)
            article_data['title'] = hackernoon_article['title']
            article_data['text'] = hackernoon_article['content']
            article_data['published_at'] = hackernoon_article['date']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                hackernoon_article['content'])
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        elif 'venturebeat' in url:
            venturebeat_article = venturebeat.ArticleLoader.load(url)
            article_data['title'] = venturebeat_article['title']
            article_data['text'] = venturebeat_article['content']
            article_data['published_at'] = venturebeat_article['date']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                venturebeat_article['content'])
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        elif 'news.com.au' in url:
            newsau_article = newsau.ArticleLoader.load(url)
            article_data['title'] = newsau_article['title']
            article_data['text'] = newsau_article['content']
            article_data['published_at'] = newsau_article['date']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                newsau_article['content'])
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        elif 'cricket' in url:
            ca_article = cricketau.ArticleLoader.load(url)
            article_data['title'] = ca_article['title']
            article_data['text'] = ca_article['content']
            print(ca_article['content'])
            article_data['published_at'] = ca_article['date']
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                ca_article['content'])
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
        else:
            article_data['title'] = article.title
            senti_analysis_data = sentiment.SentimentAnalysis.analyise(
                article.text)
            if senti_analysis_data is not None:
                article_data['s_analysis'] = senti_analysis_data['label']
    else:
        article_data['title'] = title
        senti_analysis_data = sentiment.SentimentAnalysis.analyise(text)
        if senti_analysis_data is not None:
            article_data['s_analysis'] = senti_analysis_data['label']

    sentence_nodes = []
    sentences = tokenize_sentences(article_data['text'])
    for i, data in enumerate(sentences):
        sentence_nodes.append(Node(data, index=i))

    ranked_sentences = sorted(rank_sentences(sentence_nodes),
                              key=lambda n: n.score,
                              reverse=True)

    keywords = rank_words(article_data['title'], article_data['text'])

    if suggestedKeywords is not None:
        for word in suggestedKeywords:
            newKeyword = Keyword(word, 1)
            # Only the locally ranked keyword list is updated here; the
            # freshly downloaded article is not a DB entity at this point.
            if newKeyword not in keywords:
                keywords.insert(0, newKeyword)
    if url:
        _insert_summary(title=article_data['title'],
                        text=article_data['text'],
                        url=normalize_url(url),
                        keywords=keywords,
                        sentences=ranked_sentences,
                        published_at=article_data.get('published_at'),
                        s_analysis=article_data['s_analysis'])

    return {
        'title': article_data['title'],
        'text': article_data['text'],
        'sentences': [node.data for node in ranked_sentences][:sentence_count],
        'keywords': [node.data for node in keywords][:keyword_count],
        's_analysis': article_data['s_analysis']
    }