Exemplo n.º 1
0
def transform_open_format(x):
    '''Insert one opensources record into mongo, skipping urls already stored.

    Original input format:
        (u'NutritionalAnarchy.com',
        {u'2nd type': u'',
        u'3rd type': u'',
        u'Source Notes (things to know?)': u'',
        u'type': u'unreliable'})
    '''
    known_urls = mongo_driver.get_url('opensources')
    url, attrs = x[0], x[1]
    if url in known_urls:
        return

    # template only supplies the schema keys; its values are never reused
    template = {
        'Category': 'conspiracy',
        'Reference': 'http://mediabiasfactcheck.com/zero-hedge/',
        'Truthiness': 'MIXED',
        'url': 'http://www.zerohedge.com/'
    }

    record = dict.fromkeys(template)
    record['url'] = url
    # collect every non-empty '*type*' value, deduplicated
    type_values = {attrs[key] for key in attrs if 'type' in key and attrs[key]}
    record['Category'] = ', '.join(type_values)
    record['Reference'] = 'http://www.opensources.co'

    mongo_driver.insert('opensources', record)
Exemplo n.º 2
0
        def get_articles(article):
            """Download/parse one article; store it in mongo when it has a title."""
            article.url = article.url.strip()

            try:
                article.download()
                article.parse()
            except Exception as err:
                # best-effort scrape: report the failure and move on
                print(err)
                return

            if not article.title:
                return

            article_data = {
                'title': article.title,
                'text': article.text,
                'flags': self.categories,
                'source': self.url,
                'url': article.url,
            }
            print(self.categories, '\t', article_data['source'],
                  article_data['title'])
            mongo_driver.insert('articles', article_data)
def transform_open_format(x):
    '''Store an opensources entry in mongo unless its url is already known.

    Original input format:
        (u'NutritionalAnarchy.com',
        {u'2nd type': u'',
        u'3rd type': u'',
        u'Source Notes (things to know?)': u'',
        u'type': u'unreliable'})
    '''
    if x[0] in mongo_driver.get_url('opensources'):
        return

    # only the keys of this template matter; the values are placeholders
    template = {
        'Category': 'conspiracy',
        'Reference': 'http://mediabiasfactcheck.com/conspiracy-times/',
        'Truthiness': 'MIXED',
        'url': 'http://www.conspiracy-times.com/'
    }

    out_dict = dict.fromkeys(template)
    out_dict['url'] = x[0]
    # join the distinct, non-empty values of every '*type*' field
    out_dict['Category'] = ', '.join(
        {value for key, value in x[1].items() if 'type' in key and value})
    out_dict['Reference'] = 'http://www.opensources.co'

    mongo_driver.insert('opensources', out_dict)
    def export_results(self):
        """Attach page/category metadata to the results and persist to mongo."""
        logger.debug("Exporting results")

        self.results['Reference'] = self.page
        self.results['Category'] = accumulator.cat
        logger.debug(self.results)

        logger.debug("Saving results to mongo")
        mongo_driver.insert('media_bias', self.results)
Exemplo n.º 5
0
def merge(url):
    """Merge the opensources and media_bias records for *url* into all_sources."""
    os_ = addDict(correct(url, 'os'))
    mb_ = addDict(correct(url, 'mb'))
    # plain loops instead of list comprehensions built only for side effects
    for key in ('_id', 'url'):
        os_.pop(key)
        mb_.pop(key)

    merged_ = mb_ + os_  # addDict appears to define '+' as a dict merge -- TODO confirm
    merged_['url'] = url
    mongo_driver.insert('all_sources', merged_)
Exemplo n.º 6
0
    def export_results(self):
        """Stamp the results with page/category info, echo them, and save."""
        self.results['Reference'] = self.page
        self.results['Category'] = accumulator.cat
        print(self.results)

        mongo_driver.insert('media_bias', self.results)
def merge(url):
    """Combine the 'os' and 'mb' records for *url* and insert into all_sources."""
    os_ = addDict(correct(url, 'os'))
    mb_ = addDict(correct(url, 'mb'))
    # drop bookkeeping fields with explicit loops, not side-effect comprehensions
    for key in ('_id', 'url'):
        os_.pop(key)
        mb_.pop(key)

    merged_ = mb_ + os_  # addDict presumably merges on '+' -- TODO confirm
    merged_['url'] = url
    mongo_driver.insert('all_sources', merged_)
def merge(url):
    """Merge the per-source records for *url* and store the result in mongo."""
    # lazy %-args: logging formats only when DEBUG is enabled
    logger.debug("Merging sources for url %s", url)
    os_ = addDict(correct(url, 'os'))
    mb_ = addDict(correct(url, 'mb'))
    # remove bookkeeping fields; loops instead of side-effect comprehensions
    for key in ('_id', 'url'):
        os_.pop(key)
        mb_.pop(key)

    merged_ = mb_ + os_  # addDict presumably merges on '+' -- TODO confirm
    merged_['url'] = url
    mongo_driver.insert('all_sources', merged_)
 def build(self):
     """Build the newspaper source, scrape its articles, and log run metadata."""
     self.newspaper_obj = newspaper.build(self.url,
                                          config=newspaper_config,
                                          request_timeout=3,
                                          number_threads=2)
     self.categories = self.source['Category']
     self.build_metadata()
     logger.info(
         f"found {self.newspaper_obj.size()} articles for {self.url}")
     # sanity check: size() should equal the article list length
     # NOTE(review): assert is stripped under `python -O`; consider a real check
     assert self.newspaper_obj.size() == len(self.newspaper_obj.articles)
     self.get_articles_controller()
     mongo_driver.insert('source_logs', self.meta)
Exemplo n.º 10
0
    def __init__(self, source, n_articles=45):
        """Scrape up to *n_articles* articles for *source* and log the run."""
        self._data = source
        domain = source['url'].split('/')[0]
        self.url = self.test_https(domain)
        self.categories = source['Category']
        self.n_articles = n_articles
        self.get_links()
        self.build_meta()
        print(self.url, self.categories)
        self.get_articles_controller()
        # only record run metadata when at least one article was found
        if self.source_obj.size() > 0:
            mongo_driver.insert('source_logs', self.meta)
Exemplo n.º 11
0
def main():
    """Store each fed article as a single text blob tagged with the current flag."""
    # enumerate index was unused; iterate directly
    for article in article_feeder():
        mongo_driver.insert(
            'articles_by_flag',
            {
                'article':
                article['title'] + ' ' + article['text'].replace('\n', ' '),
                'flag':
                curr_flag.val
            })
Exemplo n.º 12
0
    def build(self, source):
        """Prepare *source* for scraping, run the scrape, and log metadata."""
        self._data = source
        self.categories = source['Category']
        self.url = self.test_https(source['url'].split('/')[0])
        # test_https signals failure with the False singleton -- TODO confirm;
        # `is False` avoids the `0 == False` pitfall of the old `== False`
        if self.url is False:
            return
        self.get_links()
        self.build_meta()
        print(self.url)
        self.get_articles_controller()
        # record run metadata only when articles were actually found
        if self.source_obj.size() > 0:
            mongo_driver.insert('source_logs', self.meta)
 def get_articles(article):
     """Download one article; store it when it is English and long enough."""
     article.download()
     article.parse()
     article_data = {}
     article.url = article.url.strip()
     # keep only substantive (>200 words) English articles
     if len(article.text.split()) > 200 and detect(
             article.text) == 'en':
         # BUG FIX: 'text' was assigned twice; keep a single assignment
         article_data['title'] = article.title
         article_data['text'] = article.text
         article_data['flags'] = self.categories
         article_data['source'] = self.url
         article_data['url'] = article.url
         logger.info(
             f"{self.categories}    {article_data['source']} {article_data['title']}"
         )
         mongo_driver.insert('articles', article_data)
     else:
         logger.info(
             f"skipped article {article.title} due to insufficient length"
         )
""" This cleans all the scraped articles  """

import json

from helpers import LemmaTokenizer
import mongo_driver


def lemma_wrapper(dict_):
    """Replace the raw 'text' field with lemmatized tokens under 'article'."""
    raw_text = dict_.pop('text')
    dict_['article'] = LemmaTokenizer(raw_text)
    return dict_


def flags_articles_gen():
    """Yield every document in the 'articles' collection."""
    # the enumerate index was never used; delegate directly
    yield from mongo_driver.get_all('articles')


if __name__ == '__main__':
    # reset the target collection, then re-clean every scraped article
    mongo_driver.kill('articles_cleaned')
    mongo_driver.drop_articles()

    for i, doc in enumerate(flags_articles_gen()):
        mongo_driver.insert('articles_cleaned', lemma_wrapper(doc))
        if i % 100 == 0:
            print(i)
    json.dump(mongo_driver.db['articles_cleaned'].count(),
              open('n_articles.json', 'w'))
Exemplo n.º 15
0
    def export_results(self):
        """Add reference/category fields to the results, print, and persist."""
        self.results['Reference'] = self.page
        self.results['Category'] = accumulator.cat
        print(self.results)

        mongo_driver.insert('media_bias', self.results)
Exemplo n.º 16
0
from helpers import LemmaTokenizer
import mongo_driver


def lemma_wrapper(dict_):
    """Lemmatize the 'article' field in place and return the same dict."""
    tokens = LemmaTokenizer(dict_['article'])
    dict_['article'] = tokens
    return dict_


def flags_articles_gen():
    """Yield every document in the 'articles_by_flag' collection."""
    # the enumerate index was never used; delegate directly
    yield from mongo_driver.get_all('articles_by_flag')


if __name__ == '__main__':
    # reset the cleaned collection, then re-insert every lemmatized article
    mongo_driver.kill('articles_cleaned')
    mongo_driver.drop_articles()
    # plain loop instead of list(map(...)) built only for side effects
    for doc in flags_articles_gen():
        mongo_driver.insert('articles_cleaned', lemma_wrapper(doc))
Exemplo n.º 17
0
""" This cleans all the scraped articles  """

from helpers import LemmaTokenizer
import mongo_driver
import json


def lemma_wrapper(dict_):
    """Swap the raw 'text' field for lemmatized tokens stored under 'article'."""
    dict_['article'] = LemmaTokenizer(dict_.pop('text'))
    return dict_


def flags_articles_gen():
    """Yield every document in the 'articles' collection."""
    # drop the unused enumerate index and delegate to the cursor
    yield from mongo_driver.get_all('articles')


if __name__ == '__main__':
    # wipe the cleaned collection and rebuild it from the scraped articles
    mongo_driver.kill('articles_cleaned')
    mongo_driver.drop_articles()

    for i, doc in enumerate(flags_articles_gen()):
        mongo_driver.insert('articles_cleaned', lemma_wrapper(doc))
        if i % 100 == 0:
            print(i)
    json.dump(mongo_driver.db['articles_cleaned'].count(),
              open('n_articles.json', 'w'))
Exemplo n.º 18
0
if __name__ == '__main__':
    mongo_driver.kill('all_sources')

    os_data = get_clean_urls('opensources')
    mb_data = get_clean_urls('media_bias')

    os_urls = set(os_data.keys())
    mb_urls = set(mb_data.keys())

    shared_urls = os_urls & mb_urls

    # BUG FIX: 'total' appeared twice in this dict; the first value
    # (sum of individual counts) was silently discarded. Renamed it.
    stats = {
        'individual': [len(os_urls), len(mb_urls)],
        'combined': [len(os_urls) + len(mb_urls)],
        'not shared': len(os_urls ^ mb_urls),
        'shared': len(shared_urls),
        'total': len(os_urls | mb_urls),
        'opensource only': len(os_urls - mb_urls),
        'mediabias only': len(mb_urls - os_urls)
    }
    print(stats)

    # plain loops: the comprehensions/map were used only for side effects
    for url in os_urls - mb_urls:
        mongo_driver.insert('all_sources', correct(url, 'os'))
    for url in mb_urls - os_urls:
        mongo_driver.insert('all_sources', correct(url, 'mb'))
    for url in shared_urls:
        merge(url)

    x = sorted(mongo_driver.db['all_sources'].find().distinct('Category'))
    pprint(x)
    print(len(x))
    mb_urls = set(mb_data.keys())

    shared_urls = os_urls & mb_urls

    # BUG FIX: 'total' appeared twice; the first value (sum of the two
    # individual counts) was silently discarded. Renamed it to 'combined'.
    stats = {
        'individual': [len(os_urls), len(mb_urls)],
        'combined': [len(os_urls) + len(mb_urls)],
        'not shared': len(os_urls ^ mb_urls),
        'shared': len(shared_urls),
        'total': len(os_urls | mb_urls),
        'opensource only': len(os_urls - mb_urls),
        'mediabias only': len(mb_urls - os_urls)
    }
    print(stats)

    # explicit loops instead of list comprehensions built for side effects
    for url in os_urls - mb_urls:
        mongo_driver.insert('all_sources', correct(url, 'os'))
    for url in mb_urls - os_urls:
        mongo_driver.insert('all_sources', correct(url, 'mb'))
    for url in shared_urls:
        merge(url)

    x = sorted(mongo_driver.db['all_sources'].find().distinct('Category'))
    pprint(x)
    print(len(x))