def init(cls, session, force_drop, ignore_inactive=False,
         force_inactive=False, ignore_redirected=False):
    """Initialize the database: create tables, seed platforms, load sites.

    Parameters
    ----------
    session : SQLAlchemy session
        Session used for all inserts and the final report query.
    force_drop : bool
        When True, drop all existing tables before recreating them.
    ignore_inactive : bool
        Forwarded to the site loaders: skip inactive domains.
    force_inactive : bool
        Forwarded to the site loaders: insert inactive domains anyway.
    ignore_redirected : bool
        Forwarded to the site loaders: skip redirected domains.
    """
    configure_logging('init', console_level='INFO', file_level='WARNING')
    # Timestamp taken before loading so that the closing report can
    # list exactly the sites created or updated by this run.
    dt_before = datetime.utcnow()
    # NOTE: use the module logger consistently (the original mixed the
    # root logger via `logging.*` with `logger.*`).
    logger.info('Creating database tables:')
    if force_drop:
        logger.warning('Existed tables would be dropped and recreated!')
        Base.metadata.drop_all(ENGINE)
    else:
        logger.warning('Ignore existed tables')
    Base.metadata.create_all(ENGINE)
    logger.info('Inserting platforms if not exist')
    get_or_create_m(session, Platform, TWITTER_PLATFORM_DICT, fb_uk='name')
    get_or_create_m(session, Platform, WEB_PLATFORM_DICT, fb_uk='name')
    logger.info('Trying to load site data:')
    dc_file = join(HOAXY_HOME, 'domains_claim.txt')
    df_file = join(HOAXY_HOME, 'domains_factchecking.txt')
    site_file = join(HOAXY_HOME, 'sites.yaml')
    # Each data file is optional; load it only when present.
    if isfile(dc_file):
        logger.info('Claim domains %s found', dc_file)
        SiteCmd.load_domains(session, dc_file, site_type='claim',
                             ignore_inactive=ignore_inactive,
                             force_inactive=force_inactive,
                             ignore_redirected=ignore_redirected)
    else:
        logger.info('Claim domains %s not found', dc_file)
    if isfile(df_file):
        logger.info('Fact checking domains %s found', df_file)
        SiteCmd.load_domains(session, df_file, site_type='fact_checking',
                             ignore_inactive=ignore_inactive,
                             force_inactive=force_inactive,
                             ignore_redirected=ignore_redirected)
    else:
        logger.info('Fact checking domains %s not found', df_file)
    if isfile(site_file):
        logger.info('Site file %s found', site_file)
        SiteCmd.load_sites(session, site_file,
                           ignore_inactive=ignore_inactive,
                           force_inactive=force_inactive,
                           ignore_redirected=ignore_redirected)
    else:
        logger.info('Site file %s not found', site_file)
    # Report every site row created or touched since dt_before.
    sites = session.query(Site.domain, Site.site_type,
                          Site.base_url).filter(
                              or_(Site.created_at > dt_before,
                                  Site.updated_at > dt_before)).order_by(
                                      Site.id).all()
    logger.info("Added or updated sites are:\n %s", pprint.pformat(sites))
    logger.info("Done.")
def load_domains(cls, session, fn, site_type, ignore_inactive=False,
                 force_inactive=False, ignore_redirected=False,
                 exclusive=False):
    """Load domains of one ``site_type`` from a text file into the DB.

    Each non-comment line of ``fn`` is parsed by ``parse_domain`` into
    ``(site_dict, status)``; problem lines are reported and, unless the
    matching ``ignore_*``/``force_*`` flag tolerates them, loading is
    aborted with ``SystemExit(2)``.

    Parameters
    ----------
    session : SQLAlchemy session.
    fn : str
        Path of the domains file, one domain per line.
    site_type : str
        Site type assigned to every loaded domain ('claim', ...).
    ignore_inactive : bool
        Skip inactive domains instead of aborting.
    force_inactive : bool
        Insert inactive domains anyway instead of aborting.
    ignore_redirected : bool
        Skip redirected domains instead of aborting.
    exclusive : bool
        When True, disable all existing sites of the same type first.

    Raises
    ------
    SystemExit
        If any line is invalid, or is inactive/redirected without a
        tolerating flag.
    """
    if exclusive:
        # Disable existing domains of the same site type before loading.
        ob_expr = Site.id.asc()
        msites = get_msites(session, fb_kw=None, ob_expr=ob_expr)
        for site in msites:
            if site.site_type == site_type:
                cls.disable_site(session, site)
    logger.info('Sending HTTP requests to infer base URLs ...')
    with open(fn, 'r') as f:
        # (line_number, raw_line, site_dict, status) for every
        # non-comment line; parse_domain performs the HTTP probing.
        site_tuples = [(n + 1, line) + parse_domain(line, site_type)
                       for n, line in enumerate(f)
                       if not is_comment_line(line)]
    invalid_flag = False
    inactive_flag = False
    redirected_flag = False
    # First pass: report every problem so the user sees them all at once.
    for n, line, site, status in site_tuples:
        line = line.strip('\n')
        if status == 'invalid':
            invalid_flag = True
            logger.error('line %i %r, invalid domain', n, line)
        elif status == 'inactive':
            inactive_flag = True
            logger.warning('line %i %r, domain inactive!', n, line)
        elif status == 'redirected':
            redirected_flag = True
            logger.warning('line %i %r, domain redirected to %s!', n, line,
                           site['base_url'])
    # Abort unless every reported problem is explicitly tolerated.
    # (Error message fixed: stray quote removed, casing normalized.)
    if invalid_flag or \
            (inactive_flag and
             not (ignore_inactive or force_inactive)) or \
            (redirected_flag and not ignore_redirected):
        logger.error("""Please fix the warnings or errors above! \
Edit domains, or use --ignore-redirected to handle redirected domains, \
or use --ignore-inactive or --force-inactive to handle inactive domains""")
        raise SystemExit(2)
    # Second pass: persist every domain that is not explicitly skipped.
    for n, line, site, status in site_tuples:
        if status == 'inactive' and ignore_inactive:
            continue
        if status == 'redirected' and ignore_redirected:
            continue
        site['is_enabled'] = True
        get_or_create_m(
            session, Site, site, fb_uk='domain', onduplicate='update')
        logger.debug('Insert or update site %s', site['domain'])
def add_site_tags(cls, session, msite, source, tags):
    """Attach each tag in ``tags`` (with ``source``) to ``msite``.

    Tags the site already owns are reported and skipped; the session is
    committed once at the end.
    """
    existing = [(t.name, t.source) for t in msite.site_tags]
    unique_keys = ['name', 'source']
    for name in tags:
        payload = dict(name=name, source=source)
        if (name, source) in existing:
            logger.warning('Site %r already contains tag %r!',
                           msite.name, payload)
            continue
        # Reuse an existing SiteTag row when one matches (name, source).
        tag_model = get_or_create_m(session, SiteTag, payload,
                                    fb_uk=unique_keys)
        msite.site_tags.append(tag_model)
        logger.info('Added tag %r to Site %r', payload, msite.name)
    session.commit()
def run(cls, args):
    """Validate docopt ``args`` and dispatch to Lucene index or search."""
    try:
        args = cls.args_schema.validate(args)
    except SchemaError as e:
        sys.exit(e)
    session = Session()
    # The JVM must be started and this thread attached before any
    # lucene API call.
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()
    if args['--index'] is True:
        configure_logging(
            'lucene.index', console_level=args['--console-log-level'])
        # Persistent cursor: the largest article.group_id already indexed.
        meta = dict(
            name='article_group_id_lucene_index',
            value='0',
            value_type='int',
            description='article.group_id used for lucene index')
        mgid = get_or_create_m(session, MetaInfo, data=meta, fb_uk='name')
        if args['--mode'] == 'create':
            # A fresh index restarts from group_id 0.
            mgid.set_value(0)
            session.commit()
        logger.debug('Indexing started.. Getting articles..')
        q = """
        SELECT DISTINCT ON (a.group_id) a.id, a.group_id,
            a.canonical_url, a.title, a.meta, a.content,
            coalesce(a.date_published, a.date_captured) AS pd,
            s.domain, s.site_type
        FROM article AS a
            JOIN site AS s ON s.id=a.site_id
        WHERE a.site_id IS NOT NULL AND s.is_enabled IS TRUE
            AND a.group_id>:gid
        ORDER BY group_id, pd ASC
        """
        stmt = sqlalchemy.text(q).bindparams(gid=mgid.get_value())
        articles_iter = session.execute(stmt)
        cls.index(session, args['--mode'], articles_iter, mgid)
    elif args['--search'] is True:
        configure_logging(
            'lucene.search', console_level=args['--console-log-level'])
        cls.search(args['--query'], args['--top'])
    else:
        print("Unrecognized command!")
        sys.exit(2)
def replace_site_tags(cls, session, msite, source, tags):
    """Make ``msite``'s tag set exactly ``tags`` (all with ``source``).

    Missing tags are created/attached first, then owned tags that are
    no longer wanted are deleted; a single commit finishes the change.
    """
    desired = [(name, source) for name in tags]
    current = [(mt.name, mt.source) for mt in msite.site_tags]
    # Phase 1: attach every desired tag the site does not own yet.
    for name in tags:
        if (name, source) in current:
            continue
        mtag = get_or_create_m(session, SiteTag,
                               dict(name=name, source=source),
                               fb_uk=['name', 'source'])
        msite.site_tags.append(mtag)
    # Phase 2: drop every owned tag that is not desired any more
    # (deletes are only flushed at commit, so iterating is safe here).
    for mt in msite.site_tags:
        if (mt.name, mt.source) not in desired:
            session.delete(mt)
    session.commit()
    logger.info('Replace site tags for site %r from %r to %r', msite.name,
                current, desired)
def parse(self, jd):
    """The main parse function.

    Parameters
    --------
    jd : json
        Tweet json data.

    Procedures
    ----------
    1) validate `jd`
    2) extract URL and hashtag from `jd`
    3) insert into database
    """
    logger.debug('Parsing one tweet, begin')
    #
    # validation
    #
    # A tweet without id / created_at / user.id is unusable; bail out.
    try:
        tw_raw_id = jd['id']
        created_at = utc_from_str(jd['created_at'])
        user_raw_id = jd['user']['id']
    except KeyError as e:
        logger.error('Invalid tweet: %s', e)
        return None
    #
    # extract url, hashtag and associated tweet status id
    #
    urls_set = set()
    hashtags_set = set()
    # Collect entities from the tweet itself, its quoted status, its
    # retweeted status, and a quote nested inside the retweet.
    entities_list = []
    if 'entities' in jd:
        entities_list.append(jd['entities'])
    if 'quoted_status' in jd:
        q_jd = jd['quoted_status']
        if 'entities' in q_jd:
            entities_list.append(q_jd['entities'])
    if 'retweeted_status' in jd:
        re_jd = jd['retweeted_status']
        if 'entities' in re_jd:
            entities_list.append(re_jd['entities'])
        if 'quoted_status' in re_jd and\
                'entities' in re_jd['quoted_status']:
            entities_list.append(re_jd['quoted_status']['entities'])
    for entities in entities_list:
        if entities:
            # Accumulates into urls_set / hashtags_set in place.
            self._parse_entities(entities, urls_set, hashtags_set)
    # This tweet should contain urls
    if len(urls_set) == 0 and self.save_none_url_tweet is False:
        logger.debug('No url found in %s, ignore!', tw_raw_id)
        return None
    #
    # Insert into database
    #
    # creating user
    logger.debug('creating user')
    muser = get_or_create_m(
        self.session, TwitterUser, data=dict(raw_id=user_raw_id),
        fb_uk='raw_id')
    # creating tweet
    logger.debug('creating tweet')
    mtweet = Tweet(raw_id=tw_raw_id, json_data=jd, created_at=created_at,
                   user_id=muser.id)
    self.session.add(mtweet)
    try:
        self.session.commit()
        logger.debug('Inserted tweet %r', tw_raw_id)
    except IntegrityError as e:
        # Duplicate tweet (raw_id unique); nothing more to do.
        logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
        self.session.rollback()
        return
    # creating urls
    logger.debug('creating urls')
    for url in urls_set:
        murl = get_or_create_murl(self.session, data=dict(raw=url),
                                  platform_id=self.platform_id)
        self.session.add(AssTweetUrl(tweet_id=mtweet.id, url_id=murl.id))
        # Commit per URL so one duplicate association does not lose
        # the rest of them.
        try:
            self.session.commit()
        except IntegrityError as e:
            logger.error('ass_tweet_url IntegrityError, see: %s', e)
            self.session.rollback()
    # creating hashtags
    logger.debug('creating hashtags')
    for hashtag in hashtags_set:
        mhashtag = get_or_create_m(self.session, Hashtag,
                                   data=dict(text=hashtag), fb_uk='text')
        self.session.add(
            AssTweetHashtag(tweet_id=mtweet.id, hashtag_id=mhashtag.id))
        try:
            self.session.commit()
        except IntegrityError as e:
            logger.error('ass_tweet_hashtag IntegrityError, see: %s', e)
            self.session.rollback()
    # paring associate tweet
    # Extract retweeted/quoted/in-reply-to status ids directly from the
    # stored JSON with PostgreSQL JSON operators.
    q1 = """
    INSERT INTO ass_tweet (id, retweeted_status_id, quoted_status_id,
        in_reply_to_status_id)
    SELECT id,
        CAST(json_data#>>'{retweeted_status, id}' AS BIGINT),
        CAST(json_data#>>'{quoted_status, id}' AS BIGINT),
        CAST(json_data->>'in_reply_to_status_id' AS BIGINT)
    FROM tweet
    WHERE id=:tweet_id
    """
    q1 = text(q1).bindparams(tweet_id=mtweet.id)
    try:
        self.session.execute(q1)
        self.session.commit()
    except DataError as e:
        # Handle \u0000 exception that postgresql json do not support:
        # escape the NUL sequences in the stored JSON text, then retry
        # the ass_tweet insertion once.
        logger.warning(e)
        self.session.rollback()
        q2 = r"""
        UPDATE tweet
        SET json_data=regexp_replace(
            json_data::text, '\\u0000', '\\\\u0000', 'g')::json
        WHERE id=:tweet_id
        """
        q2 = text(q2).bindparams(tweet_id=mtweet.id)
        self.session.execute(q2)
        self.session.commit()
        logger.warning('json_data is updated (\\u0000 to \\\\u0000)')
        self.session.execute(q1)
        self.session.commit()
    logger.debug('Parsing one tweet, done.')
def parse_new_one(self, jd, session, g_urls_map, g_uusers_set, g_edges_set):
    """Parse one tweet and persist its level-1 data.

    Saves the author, the tweet, its AssTweet row, its URLs and
    hashtags, then delegates level-2 parsing (mentions/edges) to
    ``self._parse_l2``.  ``g_urls_map`` (url -> murl id),
    ``g_uusers_set`` and ``g_edges_set`` are shared accumulators that
    are updated in place across a batch of tweets.
    """
    # validate jd
    jd = replace_null_byte(jd)
    try:
        tw_raw_id = jd['id']
        created_at = utc_from_str(jd['created_at'])
        user_raw_id = jd['user']['id']
    except KeyError as e:
        logger.error('Invalid tweet: %s', e)
        return None
    # parsing, level 1
    # l_urls/l_mentions/l_hashtags are dicts with a 'union' key holding
    # the de-duplicated values gathered from all entity blocks.
    l_urls, l_mentions, l_hashtags = self._parse_l1(jd)
    if len(l_urls['union']) == 0 and self.save_none_url_tweet is False:
        logger.warning('Ignore tweet %r with no urls!', tw_raw_id)
        return None
    # saving, level 1
    logger.debug('Saving this user ...')
    muser = get_or_create_m(session, TwitterUser,
                            data=dict(raw_id=user_raw_id), fb_uk='raw_id')
    logger.debug('Saving this tweet ...')
    muser_id = muser.id
    mtweet = Tweet(raw_id=tw_raw_id, json_data=jd, created_at=created_at,
                   user_id=muser_id)
    session.add(mtweet)
    try:
        session.commit()
        logger.debug('Inserted tweet %r', tw_raw_id)
    except IntegrityError as e:
        # Duplicate tweet; skip everything else for it.
        logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
        session.rollback()
        return None
    mtweet_id = mtweet.id
    logger.debug('Saving AssTweet ...')
    retweeted_status_id = None
    quoted_status_id = None
    if 'quoted_status' in jd:
        quoted_status_id = jd['quoted_status']['id']
    if 'retweeted_status' in jd:
        retweeted_status_id = jd['retweeted_status']['id']
    in_reply_to_status_id = jd['in_reply_to_status_id']
    session.add(
        AssTweet(id=mtweet_id,
                 retweeted_status_id=retweeted_status_id,
                 quoted_status_id=quoted_status_id,
                 in_reply_to_status_id=in_reply_to_status_id))
    try:
        session.commit()
    except IntegrityError as e:
        logger.warning(e)
        session.rollback()
    logger.debug('Saving urls ...')
    for u in l_urls['union']:
        if len(u) > MAX_URL_LEN:
            # Overlong URLs are not stored; -1 marks them in the map.
            murl_id = -1
        else:
            murl_id = get_or_create_murl(session, data=dict(raw=u),
                                         platform_id=self.platform_id).id
            # Saving AssTweetUrl
            session.add(AssTweetUrl(tweet_id=mtweet_id, url_id=murl_id))
            try:
                session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_url IntegrityError, see: %s', e)
                session.rollback()
        # Record the id (or -1 sentinel) for level-2 edge building.
        g_urls_map[u] = murl_id
    # creating hashtags
    logger.debug('creating hashtags ...')
    for hashtag in l_hashtags['union']:
        mhashtag = get_or_create_m(session, Hashtag,
                                   data=dict(text=hashtag), fb_uk='text')
        session.add(
            AssTweetHashtag(tweet_id=mtweet.id, hashtag_id=mhashtag.id))
        try:
            session.commit()
        except IntegrityError as e:
            logger.error('ass_tweet_hashtag IntegrityError, see: %s', e)
            session.rollback()
    # Level 2: mentions and retweet/quote/reply edges, accumulated into
    # the shared g_* collections.
    self._parse_l2(jd, l_urls, l_mentions, g_urls_map, g_uusers_set,
                   g_edges_set)
def parse(self, jd):
    """The main parse function.

    Parameters
    --------
    jd : json
        Tweet json data.

    Procedures
    ----------
    1) do roughly parsing to validate `jd`
    2) carefully parsing and insert into database
    3) other associations
    """
    logger.debug('Parsing one tweet, begin ...')
    #
    # 1) do roughly parsing to validate the tweet
    #
    # 1-1) parsing necessary fields, if failed then it is not a valid tweet
    logger.debug('Replacing null byte if existing ...')
    jd = replace_null_byte(jd, self.fp)
    logger.debug('1) Roughly parsing ...')
    try:
        tw_raw_id = jd['id']
        created_at = utc_from_str(jd['created_at'])
        user_raw_id = jd['user']['id']
    except KeyError as e:
        logger.error('Invalid tweet: %s', e)
        return None
    # 1-2) roughly parsing
    # Gather entity blocks and the ids/user objects of any quoted or
    # retweeted status; quoted_jd/quoted_user_jd and
    # retweeted_jd/retweeted_user_jd are only bound when the
    # corresponding *_status_id is not None (guarded below).
    entities_list = []
    quoted_status_id = None
    retweeted_status_id = None
    if 'entities' in jd:
        entities_list.append(jd['entities'])
    if 'quoted_status' in jd:
        quoted_jd = jd['quoted_status']
        quoted_user_jd = jd['quoted_status']['user']
        quoted_status_id = quoted_jd['id']
        if 'entities' in quoted_jd:
            entities_list.append(quoted_jd['entities'])
    if 'retweeted_status' in jd:
        retweeted_jd = jd['retweeted_status']
        retweeted_user_jd = jd['retweeted_status']['user']
        retweeted_status_id = retweeted_jd['id']
        if 'entities' in retweeted_jd:
            entities_list.append(retweeted_jd['entities'])
    # These are None for tweets that are not replies.
    in_reply_to_status_id = jd['in_reply_to_status_id']
    in_reply_to_user_id = jd['in_reply_to_user_id']
    in_reply_to_screen_name = jd['in_reply_to_screen_name']
    urls_set = set()
    hashtags_set = set()
    mentions_set = set()
    for entities in entities_list:
        if entities:
            # Accumulates urls/hashtags/(id, screen_name) mentions in place.
            self._parse_entities(entities, urls_set, hashtags_set,
                                 mentions_set)
    # This tweet should contain urls
    if len(urls_set) == 0 and self.save_none_url_tweet is False:
        logger.warning('No url found in tweet %s, ignore!', tw_raw_id)
        return None
    #
    # 2) carefully parsing and saving into database
    #
    logger.debug('2) Carefully parsing and saving ...')
    logger.debug('2-0) Saving twitter_user raw_id=%s ...', user_raw_id)
    muser = get_or_create_m(self.session, TwitterUser,
                            data=dict(raw_id=user_raw_id), fb_uk='raw_id')
    logger.debug('Saving this user into twitter_user_union as well ...')
    create_or_update_muser(
        self.session,
        data=dict(raw_id=user_raw_id,
                  screen_name=jd['user']['screen_name'],
                  followers_count=jd['user']['followers_count'],
                  profile=jd['user'],
                  updated_at=created_at))
    # creating tweet
    logger.debug('2-0) Saving tweet raw_id=%s ...', tw_raw_id)
    # self.saved_tweet=True means the tweet row already exists (e.g. a
    # re-parse run); fetch it instead of inserting.
    if self.saved_tweet is True:
        mtweet = self.session.query(Tweet).filter_by(
            raw_id=tw_raw_id).one()
    else:
        mtweet = Tweet(raw_id=tw_raw_id, json_data=jd,
                       created_at=created_at, user_id=muser.id)
        self.session.add(mtweet)
        try:
            self.session.commit()
            logger.debug('Inserted tweet %r', tw_raw_id)
        except IntegrityError as e:
            logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
            self.session.rollback()
            return None
    tweet_id = mtweet.id
    # Saving all urls and mapping the saved id
    url_map = dict()
    logger.debug('2-0) Saving all urls and associating with tweet...')
    for url in urls_set:
        murl = get_or_create_murl(self.session, data=dict(raw=url),
                                  platform_id=self.platform_id)
        url_map[url] = murl.id
        # saving ass_tweet_url
        if self.saved_tweet is False:
            self.session.add(
                AssTweetUrl(tweet_id=tweet_id, url_id=url_map[url]))
            try:
                self.session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_url IntegrityError, see: %s', e)
                self.session.rollback()
    # 2-1) retweet, focusing on retweeted_status
    # edge direction: from retweeted_user to current user
    if retweeted_status_id is not None:
        logger.debug(
            '2-1-a) Saving the retweeted user into twitter_user_union ...')
        retweeted_user_id = retweeted_user_jd['id']
        retweeted_screen_name = retweeted_user_jd['screen_name']
        create_or_update_muser(
            self.session,
            data=dict(raw_id=retweeted_user_id,
                      screen_name=retweeted_screen_name,
                      followers_count=retweeted_user_jd['followers_count'],
                      profile=retweeted_user_jd,
                      updated_at=created_at))
        # retweeted user has been saved above, should be removed from
        # mentions
        try:
            mentions_set.remove((retweeted_user_id, retweeted_screen_name))
        except KeyError as e:
            logger.warning('Tweet %r: retweeted user not in mentions',
                           tw_raw_id)
        logger.debug('2-1-a) Saving edges for retweet ...')
        self._save_edges(url_map, retweeted_jd['entities'], tweet_id,
                         tw_raw_id,
                         from_raw_id=retweeted_user_id,
                         to_raw_id=user_raw_id,
                         is_quoted_url=False,
                         is_mention=False,
                         tweet_type='retweet')
    # 2-2) reply, focusing on current status
    # edges direction: from current user to mentions
    if in_reply_to_status_id is not None:
        # mentioned users would be saved later
        logger.debug('2-1-b) Saving edges for reply ...')
        # in_reply_to_user
        self._save_edges(url_map, jd['entities'], tweet_id, tw_raw_id,
                         from_raw_id=user_raw_id,
                         to_raw_id=in_reply_to_user_id,
                         is_quoted_url=False,
                         is_mention=False,
                         tweet_type='reply')
        # mentions (skip the replied-to user, already handled above)
        for m in jd['entities']['user_mentions']:
            to_raw_id = m.get('id')
            if to_raw_id and to_raw_id != in_reply_to_user_id:
                self._save_edges(url_map, jd['entities'], tweet_id,
                                 tw_raw_id,
                                 from_raw_id=user_raw_id,
                                 to_raw_id=to_raw_id,
                                 is_quoted_url=False,
                                 is_mention=True,
                                 tweet_type='reply')
    # 2-3) quote
    if quoted_status_id is not None:
        logger.debug(
            '2-1-c) Saving the quoted user into twitter_user_union ...')
        quoted_user_id = quoted_user_jd['id']
        quoted_screen_name = quoted_user_jd['screen_name']
        create_or_update_muser(
            self.session,
            data=dict(raw_id=quoted_user_id,
                      screen_name=quoted_screen_name,
                      followers_count=quoted_user_jd['followers_count'],
                      profile=quoted_user_jd,
                      updated_at=created_at))
        # 2-3-1) retweeted quote, focusing on quoted_status
        # treated as retweet edge
        if retweeted_status_id is not None:
            logger.debug(
                '2-1-c) Saving edges for quoting part of retweet ...')
            self._save_edges(url_map, quoted_jd['entities'], tweet_id,
                             tw_raw_id,
                             from_raw_id=retweeted_user_jd['id'],
                             to_raw_id=user_raw_id,
                             is_quoted_url=True,
                             is_mention=False,
                             tweet_type='retweet')
        # 2-3-2) replied quote, focusing on quoted_status
        # treated as reply edge
        elif in_reply_to_status_id is not None:
            logger.debug(
                '2-1-c) Saving edges for quoting part of reply ...')
            # in_reply_to_user
            self._save_edges(url_map, quoted_jd['entities'], tweet_id,
                             tw_raw_id,
                             from_raw_id=user_raw_id,
                             to_raw_id=in_reply_to_user_id,
                             is_quoted_url=True,
                             is_mention=False,
                             tweet_type='reply')
            # mentions
            for m in jd['entities']['user_mentions']:
                to_raw_id = m.get('id')
                if to_raw_id and to_raw_id != in_reply_to_user_id:
                    self._save_edges(url_map, quoted_jd['entities'],
                                     tweet_id, tw_raw_id,
                                     from_raw_id=user_raw_id,
                                     to_raw_id=to_raw_id,
                                     is_quoted_url=True,
                                     is_mention=True,
                                     tweet_type='reply')
        # 2-3-3) pure quote
        else:
            logger.debug(
                '2-1-c) Saving edge for pure quote part of quote ...')
            self._save_edges(url_map, quoted_jd['entities'], tweet_id,
                             tw_raw_id,
                             from_raw_id=quoted_user_jd['id'],
                             to_raw_id=user_raw_id,
                             is_quoted_url=True,
                             is_mention=False,
                             tweet_type='quote')
            logger.debug(
                '2-1-c) Saving edges for original part of quote ...')
            for m in jd['entities']['user_mentions']:
                to_raw_id = m.get('id')
                if to_raw_id:
                    self._save_edges(url_map, jd['entities'], tweet_id,
                                     tw_raw_id,
                                     from_raw_id=user_raw_id,
                                     to_raw_id=to_raw_id,
                                     is_quoted_url=False,
                                     is_mention=True,
                                     tweet_type='quote')
    # 2-4) original tweet: only mention edges remain to be saved.
    if retweeted_status_id is None and in_reply_to_status_id is None\
            and quoted_status_id is None and 'entities' in jd and\
            'user_mentions' in jd['entities']:
        logger.debug('2-1-d) Saving edges for original tweet ...')
        for m in jd['entities']['user_mentions']:
            to_raw_id = m.get('id')
            if to_raw_id:
                self._save_edges(url_map, jd['entities'], tweet_id,
                                 tw_raw_id,
                                 from_raw_id=user_raw_id,
                                 to_raw_id=to_raw_id,
                                 is_quoted_url=False,
                                 is_mention=True,
                                 tweet_type='origin')
    # saving all mentions ...
    logger.debug('3) Saving all mentions ...')
    # add the in_reply_to_user
    # NOTE(review): this add is unconditional, so for non-reply tweets
    # it inserts (None, None) into mentions_set and the loop below then
    # calls create_or_update_muser with raw_id=None — presumably that
    # helper tolerates/skips None ids; confirm against its definition.
    mentions_set.add((in_reply_to_user_id, in_reply_to_screen_name))
    # NOTE(review): the loop variable shadows the outer user_raw_id
    # (the tweet author); harmless here since user_raw_id is not used
    # again below, but worth renaming.
    for user_raw_id, screen_name in mentions_set:
        create_or_update_muser(
            self.session,
            data=dict(raw_id=user_raw_id,
                      screen_name=screen_name,
                      updated_at=created_at))
    # saving hashtags
    logger.debug('3) creating hashtags')
    if self.saved_tweet is False:
        for hashtag in hashtags_set:
            mhashtag = get_or_create_m(self.session, Hashtag,
                                       data=dict(text=hashtag),
                                       fb_uk='text')
            self.session.add(
                AssTweetHashtag(tweet_id=tweet_id,
                                hashtag_id=mhashtag.id))
            try:
                self.session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_hashtag IntegrityError, see: %s',
                             e)
                self.session.rollback()
    # saving associate tweet
    logger.debug('3 Saving ass_tweet ...')
    if self.saved_tweet is False:
        create_m(self.session, AssTweet,
                 data=dict(id=tweet_id,
                           retweeted_status_id=retweeted_status_id,
                           quoted_status_id=quoted_status_id,
                           in_reply_to_status_id=in_reply_to_status_id))
    logger.debug('Parsing one tweet, done.')