def assess_doc_cred(doc, cfg):
    """Main credibility assessment for a single doc

    :param doc: a validated and normalised document, ready for
        credibility assessment
    :param cfg: any configs we need to execute/customise the assessment
    :returns: a credibility assessment for the doc
    :rtype: dict
    """
    start = citimings.start()
    if content.is_tweet_doc(doc):
        return tweet_credrev.review(doc, cfg)
    if content.is_article_doc(doc):
        return article_credrev.review(doc, cfg)

    # Unsupported document type: build an error result in the requested format
    rev_format = cfg.get('acred_review_format', 'schema.org')
    # fixed: original message had a doubled closing parenthesis "))"
    msg = 'Unsupported document (not a %s)' % supported_doc_types
    if rev_format == 'cred_assessment':
        return {
            '@context': ci_context,
            '@type': 'DocumentCredibilityAssessment',
            'doc_url': doc['url'],
            'item_assessed': doc,
            'cred_assessment_error': msg,
            'date_assessed': isodate.now_utc_timestamp(),
            'timings': citimings.timing('assess_doc_cred', start),
            'credibility': 0,
            'confidence': 0,
            'explanation': msg}
    # default: schema.org style DocumentCredReview
    rating = {
        '@type': 'Rating',
        'ratingValue': 0.0,
        'confidence': 0.0,
        'ratingExplanation': msg}
    result = {
        '@context': ci_context,
        '@type': 'DocumentCredReview',
        'reviewAspect': 'credibility',
        'itemReviewed': doc,
        'dateCreated': isodate.now_utc_timestamp(),
        'author': bot_info([], cfg),
        'reviewRating': {
            **rating,
            'identifier': itnorm.calc_identifier(rating, cfg)}
    }
    return {
        **result,
        'identifier': itnorm.calc_identifier(result, cfg)
    }
def similarSent_as_SentSimilarityReview(simSent, simResult, cfg):
    """Wraps a similar-sentence hit as a `SentSimilarityReview`.

    :param simSent: dict with the matched db 'sentence' and its 'similarity'
    :param simResult: enclosing similarity-search result; provides the query
        claim ('q_claim') and the reviewer bot info ('simReviewer')
    :param cfg: configuration options
    :returns: a `SentSimilarityReview` dict
    :rtype: dict
    """
    query_sent = simResult['q_claim']
    similarity = simSent['similarity']
    sent_pair = content.as_dbq_sentpair(
        dbSent=simSent['sentence'], qSent=query_sent, cfg=cfg)
    return {
        '@context': 'http://coinform.eu',
        '@type': 'SentSimilarityReview',
        'itemReviewed': sent_pair,
        'headline': simlabel.claim_rel_str(similarity, None),
        'reviewRating': {
            '@type': 'Rating',
            'reviewAspect': 'similarity',
            'ratingValue': similarity
        },
        'dateCreated': simResult.get(
            'dateCreated', isodate.now_utc_timestamp()),
        'author': simResult['simReviewer']
    }
def calc_domain_credibility(domain, cfg=None):
    """Calculates a `DomainCredibility` for a domain via MisinfoMe

    Note that `DomainCredibility` is deprecated, use the `review` method
    which produces a `WebSiteCredReview` instead.

    :param domain: str e.g. `www.snopes.com`, or None for a default rating
    :param cfg: configuration options (accepted for interface
        compatibility; not read by this function)
    :returns: a `DomainCredibility`
    :rtype: dict
    """
    if cfg is None:  # avoid the mutable-default-argument pitfall of cfg={}
        cfg = {}
    if domain is None:
        return default_domain_crediblity(
            domain, "Default credibility for unknown domain")
    # fixed: original assert message lacked the `%` operator, so a failing
    # assert would have raised TypeError ('str' object is not callable)
    assert isinstance(domain, str), 'Expecting str, but was %s' % type(domain)
    start = citimings.start()
    try:
        return {
            **misinfome_source_credibility(domain),
            '@context': 'DomainCredibility',
            '@type': 'DomainCredibility',
            'dateCreated': isodate.now_utc_timestamp(),
            'timings': citimings.timing(
                'misinfome_source_credibility', start)
        }
    except Exception as e:
        # best-effort: fall back to a default rating if MisinfoMe fails
        logger.error("Failed misinfome source credibility. " + str(e))
        return default_domain_crediblity(
            domain, "Unable to retrieve credibility assessment")
def review_article(adoc, cfg):
    """Main credibility review for a single article

    Refactoring of `assess_article_cred`

    :param adoc: analyzed doc as returned by `analyzed_doc`
    :param cfg: config to guide this assessment
    :returns: a `ArticleCredReview`
    :rtype: dict
    """
    # TODO: ?
    # NOTE(review): timing is started here but never reported in the result
    start = citimings.start()
    dom_review = adoc_to_website_credReview(adoc, cfg)
    content_review = review_doc_content_cred(adoc, cfg)
    # TODO: ? extract sub_bots from website_cr and aggqsent_cr and make
    #  sure it matches default_sub_bots?
    rating = aggregate_subReviews(dom_review, content_review, adoc, cfg)
    headline = '%s seems *%s* %s' % (
        markdown_ref_for_article(adoc, cfg),
        credlabel.rating_label(rating, cfg),
        rating.get('ratingExplanation', '(missing explanation)'))
    return {
        **base_ArticleCredReview(cfg),
        'author': default_bot_info(cfg),
        'dateCreated': isodate.now_utc_timestamp(),
        'itemReviewed': adoc,  # maybe just return ref?
        'text': headline,
        'reviewRating': rating,
        'isBasedOn': [dom_review, content_review]
    }
def websiteCredRev_as_qclaimCredRating(websiteCredRev, cfg):
    """Converts a `WebSiteCredReview` into a credibility rating for a claim
    published on that site.

    Fact-checker sites get their confidence penalised so that their
    individual claim reviews take precedence over the site-level rating
    (refactoring of `website_credrev.penalise_credibility`).

    :param websiteCredRev: a `WebSiteCredReview`
    :param cfg: configuration options
    :returns: an `AggregateRating` dict
    :rtype: dict
    """
    site_name = dictu.get_in(websiteCredRev, ['itemReviewed', 'name'])
    site_text = websiteCredRev.get(
        'text', '(Explanation for website credibility missing)')
    site_conf = dictu.get_in(
        websiteCredRev, ['reviewRating', 'confidence'], 0.0)
    base_rating = {
        '@type': 'AggregateRating',
        'reviewAspect': 'credibility',
        'reviewCount': dictu.get_in(
            websiteCredRev, ['reviewRating', 'reviewCount'], 0),
        'ratingCount': dictu.get_in(
            websiteCredRev, ['reviewRating', 'ratingCount'], 0),
        'ratingValue': dictu.get_in(
            websiteCredRev, ['reviewRating', 'ratingValue'], 0.0),
        'dateCreated': isodate.now_utc_timestamp()
    }
    if is_by_factchecker(websiteCredRev, cfg):
        # reduce domain credibility for fact-checkers, as we want to
        # focus on their claim reviews even if their confidence is
        # relatively low.
        penalty = float(cfg.get(
            'factchecker_website_to_qclaim_confidence_penalty_factor', 0.5))
        return {
            **base_rating,
            'confidence': site_conf * penalty,
            'ratingExplanation': "as it was published in site `%s`. %s %s" % (
                site_name, site_text,
                "However, the site is a factchecker so it publishes sentences with different credibility values.")
        }
    return {
        **base_rating,
        'confidence': site_conf,
        'ratingExplanation': "as it was published on site `%s`. %s" % (
            site_name, site_text)
    }
def base_ArticleCredReview(cfg):
    """Returns the common skeleton fields of an `ArticleCredReview`.

    :param cfg: configuration options (unused here)
    :rtype: dict
    """
    skeleton = {
        '@context': content.ci_context,
        '@type': 'ArticleCredReview',
        'additionalType': content.super_types('ArticleCredReview'),
        'dateCreated': isodate.now_utc_timestamp(),
    }
    return skeleton
def search_claim(q_claim):
    """finds similar claims or sentences in a claim database

    Finding similar claims or sentences in the co-inform claim database # noqa: E501

    :param q_claim: an English sentence or claim, or a list of such
        sentences. Multiple sentences in a single string are not allowed.
    :type q_claim: str
    :rtype: dict
    """
    if q_claim is None:
        raise InvalidUsage("Claim is mandatory")
    if isinstance(q_claim, str):
        q_claims = [q_claim]
    elif isinstance(q_claim, list):
        q_claims = q_claim
    else:
        # fixed: the original if-chain was non-exclusive and left q_claims
        # undefined (NameError) for any other input type
        raise InvalidUsage(
            "Claim must be a str or list, but was %s" % type(q_claim))
    start = citimings.start()
    logger.info('Searching semantic vector space for %s claim(s)' % len(
        q_claims))
    topn = 5
    preds, claim_ids, simReviewer = search_semantic_vecspace(
        q_claims, topn=topn)
    search_semspace_t = citimings.timing('search_semantic_vecspace', start)
    assert len(preds) == len(claim_ids)
    assert len(q_claims) == len(preds)

    q_resp, claim_retrieve_t = retrieve_result_claims(
        claim_ids, q_claims, topn)

    # build one SemanticClaimSimilarityResult per query claim
    start3 = citimings.start()
    results, sub_build_ts = [], []
    for i in range(len(q_claims)):
        start4 = citimings.start()
        claim_id2pred = {idx: float(pred)
                         for idx, pred in zip(claim_ids[i], preds[i])}
        relsents, sub_ts = q_resp_to_related_sent(q_resp, claim_id2pred)
        results.append({
            '@context': ci_context,
            '@type': 'SemanticClaimSimilarityResult',
            'dateCreated': isodate.now_utc_timestamp(),
            'q_claim': q_claims[i],
            'simReviewer': simReviewer,
            'results': relsents})
        sub_build_ts.append(citimings.timing('build_result', start4, sub_ts))
    result_build_t = citimings.timing('build_results', start3, sub_build_ts)

    results, stance_pred_t = add_stance_detection(
        results, sim_threshold=stance_min_sim_threshold)
    timing = citimings.timing(
        'search_claim', start,
        [search_semspace_t, claim_retrieve_t, result_build_t, stance_pred_t])
    return {
        'results': results,
        'resultsHeader': {
            'QTime': timing['total_ms'],
            'timings': timing,
            'params': {
                'claim': q_claim
            }}}
def base_AggQSentCredReview(cfg):
    """Returns the common skeleton fields of an `AggQSentCredReview`.

    :param cfg: configuration options, used to resolve the bot author info
    :rtype: dict
    """
    skeleton = {
        '@context': ci_context,
        '@type': 'AggQSentCredReview',
        'additionalType': ['CredibilityReview', 'Review'],
        'dateCreated': isodate.now_utc_timestamp(),
        # default sub_bots
        'author': default_bot_info(cfg)
    }
    return skeleton
def claimsim_result_as_claimcred(claimsim_result, cfg):
    """Convert a `SemanticClaimSimilarityResult` into a `ClaimCredibility`

    :param claimsim_result: a `SemanticClaimSimilarityResult` dict with the
        query claim ('q_claim') and its related sentences ('results')
    :param cfg: configuration options
    :returns: a `ClaimCredibility` dict
    :rtype: dict
    """
    # TODO: delegate to reviewers to convert claimsim_result into
    #  QSentCredReview, DBClaimCredibilityReview, WebSiteCredReview, etc.
    agg_start = citimings.start()
    qsent = claimsim_result['q_claim']
    relsents = claimsim_result['results']
    # claim search no longer does domain credibility, so we have to do it here
    for relsent in relsents:
        if 'domain_credibility' not in relsent:
            relsent['domain_credibility'] = \
                website_credrev.calc_domain_credibility(relsent['domain'])
    relsents = [add_relative_credibility(relsent, cfg)
                for relsent in relsents]
    cred_dict = aggregate_credibility(relsents, cfg)
    cred_dict['source'] = 'credibility of %d related claims ' % len(relsents)
    agg_t = citimings.timing('claim_relsent_agg', agg_start)
    return {
        '@context': ci_context,
        '@type': 'ClaimCredibility',
        'claim': qsent,
        'item_assessed': {
            '@context': ci_context,
            '@type': 'Claim',
            'claim': qsent
        },
        'aggQSentCredReview': claimsim_result_as_aggQSentCredReview(
            claimsim_result, cfg),
        'related_claims': _partition_related_sents(relsents, cfg),
        'date_assessed': isodate.now_utc_timestamp(),
        'assessor': {
            '@context': ci_context,
            '@type': 'CredibilityAssessor',
            'name': 'SemanticSimilarityClaimCredibilityAssessor',
            'version': '20200208'
        },
        'credibility': cred_dict,
        'timings': agg_t
    }
def normalise(claimReview, cfg):
    """Normalises a `ClaimReview` into a `NormalisedClaimReview`.

    Maps the original review's rating(s) onto the co-inform credibility
    scale, keeping the most confident interpretation.

    :param claimReview: a `ClaimReview` dict, or None
    :param cfg: configuration options
    :returns: a `NormalisedClaimReview` dict, or None if claimReview is None
    :rtype: dict
    """
    if claimReview is None:
        return None
    assert content.is_claimReview(claimReview), "%s" % (claimReview)
    sub_ratings = normalised_claimReview_ratings(claimReview)
    most_confident = agg.select_most_confident_rating(sub_ratings)
    if most_confident is None:
        agg_rating = {
            '@type': 'AggregateRating',
            'reviewAspect': 'credibility',
            'reviewCount': 1,  # the original claimReview
            'ratingCount': len(sub_ratings),
            'ratingValue': 0.0,
            'confidence': 0.0,
            # fixed: original had the url lookup *inside* the string
            # literal, so the markdown link never contained the actual url
            'ratingExplanation': 'Failed to interpret original [review](%s)'
            % claimReview.get("url", "missing_url")
        }
    else:
        agg_rating = {
            **most_confident,
            '@type': 'AggregateRating',
            'reviewCount': 1,
            'ratingCount': len(sub_ratings)
        }
    assert isinstance(agg_rating['confidence'], float)
    assert 'ratingExplanation' in agg_rating, '%s' % (most_confident)
    return {
        '@context': ci_context,
        '@type': 'NormalisedClaimReview',
        'additionalType': content.super_types('NormalisedClaimReview'),
        'author': bot_info(cfg),
        'text': 'Claim `%s` is *%s* %s' % (
            claimReview.get('claimReviewed'),
            credlabel.rating_label(agg_rating, cfg),
            agg_rating.get('ratingExplanation', '(missing explanation)')),
        'claimReviewed': claimReview.get('claimReviewed'),
        'dateCreated': isodate.now_utc_timestamp(),
        'isBasedOn': [claimReview] + sub_ratings,
        'reviewAspect': 'credibility',
        'reviewRating': agg_rating
    }
def from_old_DomainCredibility(dom_cred, cfg):
    """Converts a `DomainCredibility` into a `WebSiteCredReview`

    :param dom_cred: a `DomainCredibility` dict
    :param cfg: configuration options
    :returns: a `WebSiteCredReview`
    :rtype: dict
    """
    domain_url = dom_cred.get('itemReviewed', 'missing_website')  # str
    # reconstruct the WebSite item from its url
    website = content.str_as_website(domain_url)
    rating_value = dictu.get_in(dom_cred, ['credibility', 'value'], 0.0)
    n_assessments = len(dom_cred['assessments'])
    explanation = 'based on %d review(s) by external rater(s)%s' % (
        n_assessments, example_raters_markdown(dom_cred))
    return {
        '@context': 'http://coinform.eu',
        '@type': 'WebSiteCredReview',
        'additionalType': content.super_types('WebSiteCredReview'),
        'itemReviewed': website,
        'text': 'Site `%s` seems *%s* %s' % (
            website.get('name', '??'),
            credlabel.describe_credval(rating_value, None),
            explanation),
        'author': misinfoMeSourceCredReviewer(),
        'reviewRating': {
            '@type': 'AggregateRating',
            'reviewAspect': 'credibility',
            'ratingValue': rating_value,
            'confidence': dictu.get_in(
                dom_cred, ['credibility', 'confidence'], 0.5),
            'ratingExplanation': explanation,
            'reviewCount': n_assessments,
            'ratingCount': n_assessments
        },
        'dateCreated': dom_cred.get(
            'dateCreated', isodate.now_utc_timestamp()),
        'reviewAspect': 'credibility',
        'isBasedOn': [],  # TODO:
        'isBasedOn_assessments': dom_cred['assessments'],
        'timings': dom_cred.get('timings', {})
    }
def assess_article_cred(article, cfg):
    """Main credibility assessment for a single article

    *Deprecated* you should move to `review_article`

    :param article: valid and normalised article
    :param cfg: config to guide this assessment
    :returns: a credibility assessment for the article
    :rtype: dict
    """
    start = citimings.start()
    adoc = analyzed_doc(article, cfg)
    adoc_t = adoc['timings']

    domcred = adoc_to_domain_cred(adoc, cfg)
    content_cred = assess_doc_content_cred(adoc, cfg)
    agg_cred = aggregate_article_cred(domcred, content_cred, cfg)
    sub_timings = [adoc_t,
                   domcred.get('timings', None),
                   content_cred.get('timings', None)]
    return {
        '@context': content.ci_context,
        '@type': 'ArticleCredibilityAssessment',
        'doc_url': article['url'],
        'item_assessed': article,
        # NOTE(review): key typo 'asessed' kept as-is; consumers of this
        # deprecated format may rely on the exact key name
        'date_asessed': isodate.now_utc_timestamp(),
        'assessor': {
            '@context': content.ci_context,
            '@type': 'CredibilityAssessor',
            'name': 'ArticleCredibilityAssessor',
            'version': '20200207'
        },
        'doc_resolved_url': adoc.get('resolved_url', adoc.get('url')),
        'analyzed_doc': adoc,
        **agg_cred,
        'sub_assessments': [domcred, content_cred],
        'timings': citimings.timing('assess_article_cred', start,
                                    sub_timings)
        # 'claims_in_doc': claim_creds,
        # 'domain_credibility': domcred,
        # 'content_credibility': content_cred
    }
def dummyPrediction(tweet):
    """Returns a random `TweetCredibilityAssessment` for `tweet`.

    Useful as a placeholder/baseline: no actual analysis is performed.

    :param tweet: dict with at least a 'tweet_id' field
    :rtype: dict
    """
    start = citimings.start()
    assessment = {
        '@context': ci_context,
        '@type': 'TweetCredibilityAssessment',
        'tweet_id': int(tweet['tweet_id']),
        'item_assessed': tweet,
        'credibility': random.random(),  # random score, by design
        'confidence': 0.0,
        'explanation': 'Dummy prediction, no actual analysis performed.',
        'sub_assessments': [],
        'date_assessed': isodate.now_utc_timestamp(),
        'assessor': {'@context': ci_context,
                     'name': 'dummyCredibilityPredictor'},
        'timings': citimings.timing('dummyPrediction', start)
        # deprecated, now as sub_assessments
        # 'sentences_in_tweets': [],
        # 'sentences_linked': []
    }
    return assessment
def worthinesspreds_as_SentCheckWorthinessReview(mapped_pred, config):
    """Wraps a check-worthiness prediction as a `SentCheckWorthinessReview`.

    :param mapped_pred: dict with 'sentence', 'ratingValue' and 'confidence'
    :param config: configuration options, used to resolve the reviewer bot
    :returns: a `SentCheckWorthinessReview` dict with its 'identifier' set
    :rtype: dict
    """
    sentence = mapped_pred['sentence']
    rating_value = mapped_pred['ratingValue']
    review = {
        "@context": ci_context,
        "@type": "SentCheckWorthinessReview",
        "additionalType": content.super_types('SentCheckWorthinessReview'),
        'reviewAspect': 'checkworthiness',
        'itemReviewed': content.as_sentence(sentence),
        'reviewRating': {
            '@type': 'Rating',
            'reviewAspect': 'checkworthiness',
            'ratingValue': rating_value,
            'confidence': mapped_pred['confidence'],
            'ratingExplanation': rating_exp(rating_value, sentence)
        },
        'dateCreated': isodate.now_utc_timestamp(),
        "author": checkWorthinessReviewer(config)
    }
    # identifier depends on the review contents, so compute it last
    review['identifier'] = calc_worth_review_id(review)
    return review
def aggregate_subReviews(db_Sentence, claimReview, webSiteCred, cfg):
    """Aggregates (claim and WebSite) reviews about a DB Sentence into a
    credibility review

    :param db_Sentence: a `Sentence` in the Co-inform database
    :param claimReview: a `ClaimReview` for the db_Sentence. May be None if
      no claim review is available for the sentence. In general, the claim
      review will not have been normalised (i.e. mapped onto the co-inform
      accuracy/credibility scales)
    :param webSiteCred: a `WebSiteCredReview` for a webSite where the
      `db_Sentence` was published.
    :param cfg: configuration options
    :returns: a `DBSentCredReview`
    :rtype: dict
    """
    nClaimReview = crn.normalise(claimReview, cfg) or {}
    nWebSiteRating = None
    if webSiteCred:
        nWebSiteRating = websiteCredRev_as_qclaimCredRating(webSiteCred, cfg)
        assert type(nWebSiteRating['confidence']) == float
    assert type(dictu.get_in(
        nClaimReview, ['reviewRating', 'confidence'], 0.0)) == float

    # keep only the ratings we actually have, then pick the most confident
    subRatings = [r for r in
                  [nWebSiteRating, nClaimReview.get('reviewRating', None)]
                  if r is not None]
    sel_rating = agg.select_most_confident_rating(subRatings) or {
        'ratingValue': 0.0,
        'confidence': 0.0,
        'ratingExplanation':
        'No website or claimReview associated with this sentence'
    }
    isBasedOn = [ibo for ibo in [webSiteCred, nClaimReview]
                 if ibo is not None and ibo != {}]
    reviewCount = agg.total_reviewCount(subRatings) + len(isBasedOn)
    ratingCount = agg.total_ratingCount(subRatings)
    # should be a superset of [ibo.get('author') for ibo in isBasedOn]
    sub_bots = default_sub_bots(cfg)

    # markdown link to (the first of) the doc(s) where the sentence appears
    appears_in_docs = db_Sentence.get('appearance', [])
    appears_in_doc = appears_in_docs[0] if appears_in_docs else None
    link_to_doc = md_link_to_doc(appears_in_doc)

    revRating = {
        '@type': 'AggregateRating',
        'reviewAspect': 'credibility',
        'reviewCount': reviewCount,
        'ratingCount': ratingCount,
        'ratingValue': sel_rating.get('ratingValue', 0.0),
        'confidence': sel_rating.get('confidence', 0.0),
        'ratingExplanation': sel_rating.get('ratingExplanation')
    }
    return {
        '@context': "http://coinform.eu",
        '@type': "DBSentCredReview",
        'additionalType': content.super_types('DBSentCredReview'),
        'itemReviewed': db_Sentence,
        'text': 'Sentence `%s` %sseems *%s* %s' % (
            db_Sentence.get('text', '??'),
            ', in %s, ' % (link_to_doc) if link_to_doc else '',
            credlabel.rating_label(revRating, cfg),
            sel_rating.get('ratingExplanation')
        ),
        'reviewRating': revRating,
        'reviewAspect': 'credibility',
        'isBasedOn': isBasedOn,
        'dateCreated': isodate.now_utc_timestamp(),
        'author': bot_info(sub_bots, cfg)
    }
def aggregate_subReviews(subReviews, tweet, cfg):
    """Creates an aggregate review based on subReviews for tweet

    Refactoring of `aggregate_tweet_cred`

    :param subReviews: list of credibility reviews for (parts of) the
      tweet to review. May be None or empty.
    :param tweet: the tweet being reviewed
    :param cfg: config options
    :returns: a credibility review for the `tweet` to review that
      contains an `AggregateRating` based on the `subReviews`
    :rtype: dict
    """
    # TODO: extract sub_bots and compare to default_sub_bots
    if subReviews is None:
        # normalise early so 'isBasedOn' below is never None
        subReviews = []
    partial_TweetCredReview = {
        '@context': ci_context,
        '@type': 'TweetCredReview',
        'itemReviewed': tweet,
        'isBasedOn': subReviews,
        'dateCreated': isodate.now_utc_timestamp(),
        'author': default_bot_info(cfg)
    }
    tweet_mdref = markdown_ref_for_tweet(tweet, cfg)
    subRatings = [
        sr.get('reviewRating') for sr in subReviews
        if sr.get('reviewRating') is not None
    ]

    # filter by min confidence
    conf_threshold = float(cfg.get('cred_conf_threshold', 0.7))
    filter_fn = agg.filter_review_by_min_confidence(conf_threshold)
    conf_subRevs = [sr for sr in subReviews if filter_fn(sr)]
    igno_subRevs = [sr for sr in subReviews if not filter_fn(sr)]

    # no (confident) subReviews
    if len(conf_subRevs) == 0:
        part_rating = {
            '@type': 'Rating',
            'ratingValue': 0.0,
            'confidence': 0.0,
            'reviewAspect': 'credibility'
        }
        if len(subReviews) == 0:
            # fixed: original applied `% (tweet_mdref)` to a format string
            # with no placeholder, raising TypeError at runtime; the tweet
            # ref is already included via the 'text' field below
            msg = ("we could not extract (or assess credibility of) "
                   "its sentences or linked documents")
            rating = {**part_rating, 'ratingExplanation': msg}
        else:
            msg = 'we could not assess the credibility of its %d sentences or linked documents.%s' % (
                len(subReviews),
                '\nFor example:\n * %s' % (igno_subRevs[0]['text']))
            rating = {
                **part_rating,
                '@type': 'AggregateRating',
                'ratingExplanation': msg,
                'ratingCount': agg.total_ratingCount(subRatings),
                'reviewCount': agg.total_reviewCount(subRatings) + len(subReviews)
            }
        return {
            **partial_TweetCredReview,
            'text': '%s seems *%s* as %s' % (
                tweet_mdref, credlabel.rating_label(rating, cfg), msg),
            'reviewRating': rating
        }

    # select least credible subReview
    subRevs_by_val = sorted(
        conf_subRevs,
        key=lambda rev: dictu.get_in(
            rev, ['reviewRating', 'ratingValue'], 0.0))
    least_cred_rev = subRevs_by_val[0]
    msg = 'based on its least credible part:\n%s' % (dictu.get_in(
        least_cred_rev, ['text'], '(missing explanation for part)'))
    revRating = {
        '@type': 'AggregateRating',
        'reviewAspect': 'credibility',
        'ratingValue': dictu.get_in(
            least_cred_rev, ['reviewRating', 'ratingValue'], 0.0),
        'confidence': dictu.get_in(
            least_cred_rev, ['reviewRating', 'confidence'], 0.0),
        'ratingExplanation': msg,
        'ratingCount': agg.total_ratingCount(subRatings),
        'reviewCount': agg.total_reviewCount(subRatings) + len(subReviews)
    }
    return {
        **partial_TweetCredReview,
        'isBasedOn': subRevs_by_val + igno_subRevs,  # just a re-ordering
        'text': '%s seems *%s* %s' % (
            tweet_mdref, credlabel.rating_label(revRating, cfg), msg),
        'reviewRating': revRating
    }
def aggregate_subReviews(simple_sentSimReview, stanceReview, cfg):
    """Aggregates a similarity and stance review into a polar similarity
    review

    :param simple_sentSimReview: a (non-polar) `SentSimilarityReview` for
      a `sentPair`
    :param stanceReview: a `SentStanceReview` for the same `sentPair` as
      `simple_sentSimReview`. May be None, in which case the plain
      similarity review is returned unchanged.
    :param cfg: configuration options
    :returns: a `SentPolarSimilarityReview`
    :rtype: dict
    """
    assert simple_sentSimReview is not None
    if stanceReview is None:
        return simple_sentSimReview
    sim = dictu.get_in(simple_sentSimReview, ['reviewRating', 'ratingValue'])
    sent_stance = dictu.get_in(stanceReview, ['reviewRating', 'ratingValue'],
                               'unrelated')
    # fixed: original default was the *string* '0.5', which would have
    # leaked a str into the numeric 'confidence' field below
    stance_conf = dictu.get_in(stanceReview, ['reviewRating', 'confidence'],
                               0.5)
    sent_pair = simple_sentSimReview['itemReviewed']
    assert stanceReview['itemReviewed'] == sent_pair, '%s != %s' % (
        stanceReview['itemReviewed'], sent_pair)
    agg_sim = calc_agg_polarsim(sim=sim, sent_stance=sent_stance,
                                sent_stance_conf=stance_conf, cfg=cfg)
    sub_reviews = [
        sr for sr in [simple_sentSimReview, stanceReview] if sr is not None
    ]
    sub_ratings = [
        srev.get('reviewRating') for srev in sub_reviews
        if srev.get('reviewRating') is not None
    ]
    headline = simlabel.claim_rel_str(sim, sent_stance)
    # TODO: more than an explanation this is the review body
    #  the explanation would be that one model said the sentences were x
    #  similar while another said they were (stance)
    explanation = 'Sentence `%s` %s `%s`' % (
        dictu.get_in(sent_pair, ['sentA', 'text']),
        headline,
        dictu.get_in(sent_pair, ['sentB', 'text']))
    sub_bots = [
        simple_sentSimReview.get('author', {}),
        stanceReview.get('author', {})
    ]
    return {
        '@context': 'http://coinform.eu',
        '@type': 'SentPolarSimilarityReview',
        'additionalType': content.super_types('SentPolarSimilarityReview'),
        'itemReviewed': sent_pair,
        'headline': headline,
        'reviewAspect': 'polarSimilarity',
        'reviewBody': explanation,
        'reviewRating': {
            '@type': 'AggregateRating',
            'reviewAspect': 'polarSimilarity',
            'ratingValue': agg_sim,
            'confidence': stance_conf,
            'reviewCount': len(sub_reviews),
            'ratingCount': agg.total_ratingCount(sub_ratings),
            'ratingExplanation': explanation
        },
        'isBasedOn': sub_reviews,
        'dateCreated': isodate.now_utc_timestamp(),
        'author': bot_info(sub_bots, cfg)
    }