示例#1
0
def test_get_url_info_1(connection):
    '''
    tests that get_url_info can handle very long urls
    '''
    # FIXME: the database should be parameterized somehow
    long_url = 'https://www.nytimes.com/' + 1000 * 'absurd_path'
    get_url_info(connection, long_url)
    return True
示例#2
0
def test_get_url_info_2(connection):
    '''
    tests that get_url_info returns the same url_id when called twice on
    a new randomly generated url
    '''
    # FIXME: the database should be parameterized somehow
    target = 'https://www.example.com/' + str(random.randrange(100000000000))
    first_info = get_url_info(connection, target)
    second_info = get_url_info(connection, target)
    return first_info['id_urls'] == second_info['id_urls']
示例#3
0
def test_get_url_info_3(connection):
    '''
    tests that get_url_info can handle a url reconstructed from an
    explicit url_info dictionary via urlinfo2url
    '''
    parts = {
        'scheme': 'https',
        'hostname': 'media1.faz.net',
        'port': -1,
        'path':
        '/ppmedia/aktuell/wirtschaft/2406223479/1.6105689/article_aufmacher_klein/ist-amazon-zu-maechtig.jpg',
        'params': '',
        'query': '',
        'fragment': '',
        'other': '',
        'depth': 7
    }
    # FIXME: the database should be parameterized somehow
    get_url_info(connection, urlinfo2url(parts))
    return True
示例#4
0
def insert_tweet(connection, tweet):
    '''
    Inserts the tweet into the database.

    Inserts/updates the author into twitter.users, the tweet itself into
    twitter.tweets, and the tweet's urls, mentions, tags, and media into
    their respective association tables.

    Args:
        connection: a sqlalchemy connection to the postgresql db
        tweet: a dictionary representing the json tweet object
    '''
    # skip tweet if already inserted
    sql = sqlalchemy.sql.text('''
    SELECT id_tweets 
    FROM twitter.tweets
    WHERE id_tweets = :id_tweets
        ''')
    res = connection.execute(sql, {
        'id_tweets': tweet['id'],
    })
    if res.first() is not None:
        return

    # insert tweet; a single transaction so a partial insert is rolled back
    # if any statement below fails
    with connection.begin():

        ######################################## user

        # resolve the user's profile url (if any) to a row in the urls table
        if tweet['user']['url'] is None:
            tweet_user_url = {}
        else:
            tweet_user_url = get_url_info(connection, tweet['user']['url'])

        # create/update the user; the WHERE clause ensures we only overwrite
        # existing rows with information from a newer tweet
        sql = sqlalchemy.sql.text('''
        INSERT INTO twitter.users
            (id_users,created_at,updated_at,screen_name,name,location,id_urls,hostname,description,protected,verified,friends_count,listed_count,favourites_count,statuses_count,withheld_in_countries)
            VALUES
            (:id_users,:created_at,:updated_at,:screen_name,:name,:location,:id_urls,:hostname,:description,:protected,:verified,:friends_count,:listed_count,:favourites_count,:statuses_count,:withheld_in_countries)
            ON CONFLICT (id_users) DO
            UPDATE SET
                created_at=:created_at,
                updated_at=:updated_at,
                screen_name=:screen_name,
                name=:name,
                location=:location,
                id_urls=:id_urls,
                hostname=:hostname,
                description=:description,
                protected=:protected,
                verified=:verified,
                friends_count=:friends_count,
                listed_count=:listed_count,
                favourites_count=:favourites_count,
                statuses_count=:statuses_count,
                withheld_in_countries=:withheld_in_countries
                WHERE :updated_at > twitter.users.updated_at OR twitter.users.updated_at is null

            ''')
        res = connection.execute(
            sql, {
                'id_users':
                tweet['user']['id'],
                'created_at':
                tweet['user']['created_at'],
                'updated_at':
                tweet['created_at'],
                'screen_name':
                remove_nulls(tweet['user']['screen_name']),
                'name':
                remove_nulls(tweet['user']['name']),
                'location':
                remove_nulls(tweet['user']['location']),
                'id_urls':
                tweet_user_url.get('id_urls', None),
                'hostname':
                tweet_user_url.get('hostname', None),
                'description':
                remove_nulls(tweet['user']['description']),
                'protected':
                tweet['user']['protected'],
                'verified':
                tweet['user']['verified'],
                'friends_count':
                tweet['user']['friends_count'],
                'listed_count':
                tweet['user']['listed_count'],
                'favourites_count':
                tweet['user']['favourites_count'],
                'statuses_count':
                tweet['user']['statuses_count'],
                'withheld_in_countries':
                tweet['user'].get('withheld_in_countries', None),
            })

        ######################################## geolocation

        # Default to no geolocation; previously geo_str (and sometimes
        # geo_coords) could be referenced before assignment below, raising
        # NameError when neither tweet['geo'] nor tweet['place'] provided
        # coordinates and geo_enabled was false.  With both NULL, the
        # ST_GeomFromText argument concatenates to NULL and the geo column
        # is stored as NULL.
        geo_str = None
        geo_coords = None
        try:
            # exact point coordinates; subscripting raises TypeError when
            # tweet['geo'] is None
            geo_coords = str(tweet['geo']['coordinates'][0]) + ' ' + str(
                tweet['geo']['coordinates'][1])
            geo_str = 'POINT'
        except TypeError:
            try:
                # fall back to the place's bounding box polygons; build the
                # WKT coordinate list in a local so geo_coords is only set
                # once the whole string is complete
                coords = '('
                for i, poly in enumerate(
                        tweet['place']['bounding_box']['coordinates']):
                    if i > 0:
                        coords += ','
                    coords += '('
                    for point in poly:
                        coords += str(point[0]) + ' ' + str(point[1]) + ','
                    # close the ring by repeating the first point
                    coords += str(poly[0][0]) + ' ' + str(poly[0][1])
                    coords += ')'
                coords += ')'
                geo_coords = coords
                geo_str = 'MULTIPOLYGON'
            except (KeyError, TypeError):
                # TypeError covers tweet['place'] being None, which the
                # original KeyError-only handler let escape
                if tweet['user']['geo_enabled']:
                    raise ValueError('couldnt find geolocation information')

        ######################################## scalar columns

        # prefer the full text of extended tweets; was a bare except, which
        # also swallowed KeyboardInterrupt and real bugs
        try:
            text = tweet['extended_tweet']['full_text']
        except (KeyError, TypeError):
            text = tweet['text']

        # KeyError added for robustness when 'place' is absent entirely
        try:
            country_code = tweet['place']['country_code'].lower()
        except (KeyError, TypeError):
            country_code = None

        # for US tweets, the state code is the last component of the
        # place's full name (e.g. 'Portland, OR' -> 'or')
        if country_code == 'us':
            state_code = tweet['place']['full_name'].split(
                ',')[-1].strip().lower()
            if len(state_code) > 2:
                state_code = None
        else:
            state_code = None

        try:
            place_name = tweet['place']['full_name']
        except (KeyError, TypeError):
            place_name = None

        # created unhydrated references
        if tweet.get('in_reply_to_user_id', None) is not None:
            sql = sqlalchemy.sql.text('''
            INSERT INTO twitter.users
                (id_users,screen_name)
                VALUES
                (:id_users,:screen_name)
            ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(
                sql, {
                    'id_users': tweet['in_reply_to_user_id'],
                    'screen_name': tweet['in_reply_to_screen_name'],
                })

        # insert the tweet (stray f-string prefix removed; the SQL has no
        # interpolations)
        sql = sqlalchemy.sql.text('''
        INSERT INTO twitter.tweets
            (id_tweets,id_users,created_at,in_reply_to_status_id,in_reply_to_user_id,quoted_status_id,geo,retweet_count,quote_count,favorite_count,withheld_copyright,withheld_in_countries,place_name,country_code,state_code,lang,text,source)
            VALUES
            (:id_tweets,:id_users,:created_at,:in_reply_to_status_id,:in_reply_to_user_id,:quoted_status_id,ST_GeomFromText(:geo_str || '(' || :geo_coords || ')'),:retweet_count,:quote_count,:favorite_count,:withheld_copyright,:withheld_in_countries,:place_name,:country_code,:state_code,:lang,:text,:source)
        ON CONFLICT DO NOTHING;
            ''')
        res = connection.execute(
            sql, {
                'id_tweets': tweet['id'],
                'id_users': tweet['user']['id'],
                'created_at': tweet['created_at'],
                'in_reply_to_status_id': tweet.get('in_reply_to_status_id',
                                                   None),
                'in_reply_to_user_id': tweet.get('in_reply_to_user_id', None),
                'quoted_status_id': tweet.get('quoted_status_id', None),
                'geo_coords': geo_coords,
                'geo_str': geo_str,
                'retweet_count': tweet.get('retweet_count', None),
                'quote_count': tweet.get('quote_count', None),
                'favorite_count': tweet.get('favorite_count', None),
                'withheld_copyright': tweet.get('withheld_copyright', None),
                'withheld_in_countries': tweet.get('withheld_in_countries',
                                                   None),
                'place_name': place_name,
                'country_code': country_code,
                'state_code': state_code,
                'lang': tweet.get('lang'),
                'text': remove_nulls(text),
                'source': remove_nulls(tweet.get('source', None)),
            })

        ######################################## urls

        try:
            urls = tweet['extended_tweet']['entities']['urls']
        except KeyError:
            urls = tweet['entities']['urls']

        for url in urls:
            url_info = get_url_info(connection, url['expanded_url'])
            if url_info is None:
                id_urls = None
                hostname = None
            else:
                id_urls = url_info['id_urls']
                hostname = url_info['hostname']
            sql = sqlalchemy.sql.text('''
            INSERT INTO twitter.tweet_urls
                (id_tweets,id_urls,hostname)
                VALUES
                (:id_tweets,:id_urls,:hostname)
            ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(sql, {
                'id_tweets': tweet['id'],
                'id_urls': id_urls,
                'hostname': hostname
            })

        ######################################## mentions

        try:
            mentions = tweet['extended_tweet']['entities']['user_mentions']
        except KeyError:
            mentions = tweet['entities']['user_mentions']

        for mention in mentions:
            # unhydrated stub row for the mentioned user
            sql = sqlalchemy.sql.text('''
            INSERT INTO twitter.users
                (id_users,name,screen_name)
                VALUES
                (:id_users,:name,:screen_name)
            ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(
                sql, {
                    'id_users': mention['id'],
                    'name': remove_nulls(mention['name']),
                    'screen_name': remove_nulls(mention['screen_name']),
                })

            sql = sqlalchemy.sql.text('''
            INSERT INTO twitter.tweet_mentions
                (id_tweets,id_users)
                VALUES
                (:id_tweets,:id_users)
            ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(sql, {
                'id_tweets': tweet['id'],
                'id_users': mention['id']
            })

        ######################################## hashtags / cashtags

        try:
            hashtags = tweet['extended_tweet']['entities']['hashtags']
            cashtags = tweet['extended_tweet']['entities']['symbols']
        except KeyError:
            hashtags = tweet['entities']['hashtags']
            cashtags = tweet['entities']['symbols']

        # store both kinds of tag in one column, distinguished by prefix
        tags = ['#' + hashtag['text'] for hashtag in hashtags
                ] + ['$' + cashtag['text'] for cashtag in cashtags]

        for tag in tags:
            sql = sqlalchemy.sql.text('''
            INSERT INTO twitter.tweet_tags
                (id_tweets,tag)
                VALUES
                (:id_tweets,:tag)
            ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(sql, {
                'id_tweets': tweet['id'],
                'tag': remove_nulls(tag)
            })

        ######################################## media

        try:
            media = tweet['extended_tweet']['extended_entities']['media']
        except KeyError:
            try:
                media = tweet['extended_entities']['media']
            except KeyError:
                media = []

        for medium in media:
            url_info = get_url_info(connection, medium['media_url'])
            # get_url_info may return None (handled in the urls loop above
            # but previously not here, causing a TypeError on subscript)
            if url_info is None:
                id_urls = None
                hostname = None
            else:
                id_urls = url_info['id_urls']
                hostname = url_info['hostname']
            sql = sqlalchemy.sql.text('''
            INSERT INTO twitter.tweet_media
                (id_tweets,id_urls,hostname,type)
                VALUES
                (:id_tweets,:id_urls,:hostname,:type)
            ON CONFLICT DO NOTHING
                ''')
            res = connection.execute(
                sql, {
                    'id_tweets': tweet['id'],
                    'id_urls': id_urls,
                    'hostname': hostname,
                    'type': medium['type']
                })
示例#5
0
    def parse(self, response):
        '''
        Scrapy callback: store the downloaded article and its outgoing
        links in the database, then yield follow-up requests for every
        extracted link.

        Args:
            response: the scrapy response for a crawled page; non-HTML
                responses are ignored.

        Yields:
            scrapy.http.Request objects for each extracted link,
            prioritized by keyword counts.
        '''

        # only parse html pages
        if not isinstance(response, HtmlResponse):
            return

        # process the downloaded webpage
        article = html2article(response.url, response.body)
        all_links = self.le.extract_links(response)

        # insert article into database; one transaction covers the
        # articles/keywords/authors/refs inserts so they commit atomically
        # NOTE(review): `trans` is never used; the context manager alone
        # manages commit/rollback
        with self.connection.begin() as trans:

            # insert into articles table
            # NOTE(review): assumes response.request carries custom
            # attributes (depth, id_urls, hostname) set elsewhere in the
            # spider -- confirm against the request-building code
            url_info = get_url_info(
                self.connection,
                article.canonical_link,
                depth=response.request.depth,
            )
            id_urls_canonical = url_info['id_urls']
            sql = sqlalchemy.sql.text('''
                INSERT INTO articles 
                (id_urls,hostname,id_urls_canonical,id_responses,title,text,html,lang,pub_time) 
                values 
                (:id_urls,:hostname,:id_urls_canonical,:id_responses,:title,:text,:html,:lang,:pub_time)
                returning id_articles
            ''')
            res = self.connection.execute(
                sql, {
                    'id_urls': response.request.id_urls,
                    'hostname': response.request.hostname,
                    'id_urls_canonical': id_urls_canonical,
                    'id_responses': response.id_responses,
                    'title': article.title,
                    'text': article.text,
                    'html': article.article_html,
                    'lang': article.lang,
                    'pub_time': article.publish_date,
                })
            # primary key of the row just inserted (from RETURNING)
            id_articles = res.first()[0]

            # update keywords table: count occurrences of each
            # language-specific keyword in the full text, body text, and
            # title, then store the totals under the single 'north korea'
            # keyword row
            keywords_lang = self.keywords.get(article.lang, [])
            alltext_lower = article.alltext.lower()
            text_lower = article.text.lower()
            title_lower = article.title.lower()
            keywords_alltext = sum(
                [alltext_lower.count(keyword) for keyword in keywords_lang])
            keywords_text = sum(
                [text_lower.count(keyword) for keyword in keywords_lang])
            keywords_title = sum(
                [title_lower.count(keyword) for keyword in keywords_lang])
            sql = sqlalchemy.sql.text('''
            INSERT INTO keywords
                (id_articles,keyword,num_title,num_text,num_alltext)
                VALUES
                (:id_articles,:keyword,:num_title,:num_text,:num_alltext)
            ''')
            res = self.connection.execute(
                sql, {
                    'id_articles': id_articles,
                    'keyword': 'north korea',
                    'num_title': keywords_title,
                    'num_text': keywords_text,
                    'num_alltext': keywords_alltext,
                })

            # update authors table: one row per author of the article
            for author in article.authors:
                sql = sqlalchemy.sql.text('''
                    INSERT INTO authors (id_articles,author) values (:id_articles,:author)
                ''')
                self.connection.execute(sql, {
                    'id_articles': id_articles,
                    'author': author,
                })

            # update refs table: collect every outgoing reference
            # (top image, inline images, movies, hyperlinks) as
            # [url, type, link text] triples
            refs = []
            refs.append([article.top_image, 'top_image', ''])
            for url in article.images:
                refs.append([url, 'image', ''])
            for url in article.movies:
                refs.append([url, 'movie', ''])
            for url in all_links:
                refs.append([url.url, 'link', url.text])
            for (url, url_type, text) in refs:
                # referenced pages sit one level deeper in the crawl
                target_url_info = get_url_info(self.connection,
                                               url,
                                               depth=response.request.depth +
                                               1)
                # get_url_info may return None; skip refs it cannot resolve
                if target_url_info is not None:
                    target = target_url_info['id_urls']
                    sql = sqlalchemy.sql.text('''
                    insert into refs
                        (source,target,type,text)
                        values
                        (:source,:target,:type,:text);
                    ''')
                    self.connection.execute(
                        sql, {
                            'source': id_articles,
                            'target': target,
                            'type': url_type,
                            # truncate link text to the column limit
                            'text': text[:2048],
                        })

        # yield all links
        # (keywords_lang and the keyword counts assigned inside the
        # with-block above remain in scope here)
        for link in all_links:

            # calculate keywords in link text
            keywords_link = sum(
                [link.text.count(keyword) for keyword in keywords_lang])

            # create the request; priority weights link-text matches
            # highest, then title matches, then body/alltext matches
            r = scrapy.http.Request(url=link.url)
            r.priority = 10000 * keywords_link + 100 * keywords_title + keywords_text + keywords_alltext
            r.meta.update(link_text=link.text)
            r.depth = response.request.depth + 1

            yield r
示例#6
0
sys.path.append('.')
from Novichenko.Bot.sqlalchemy_utils import get_url_info, insert_request
from Novichenko.Bot.spiders.general_spider import GeneralSpider

# create database connection
engine = sqlalchemy.create_engine(args.db)
connection = engine.connect()

# add seeds
with open(args.csv) as f:

    reader = csv.DictReader(f)
    for row in reader:

        # extract hostname from row
        url_info = get_url_info(connection, row['URL'])
        hostname = url_info['hostname']
        #if hostname[:4]=='www.':
        #hostname=hostname[4:]
        print(f'adding {hostname}')

        # add seed URLS into the frontier
        if args.add_seeds and row['PRIORITY'] != 'ban':
            urls = [
                #'http://'+hostname+'/'+url_info['path'],
                #'http://www.'+hostname+'/'+url_info['path'],
                #'https://'+hostname+'/'+url_info['path'],
                #'https://www.'+hostname+'/'+url_info['path'],
            ]
            if len(url_info['path']) > 1:
                urls += [