def bot_info(sub_bots, cfg):
    """Returns a description for this AggQSentCredReviewer

    :param sub_bots: a list of bot items used by this AggQSentCredReviewer
    :param cfg: config options
    :returns: an `AggQSentCredReviewer` item
    :rtype: dict
    """
    result = {
        '@context': ci_context,
        '@type': 'AggQSentCredReviewer',
        'additionalType': content.super_types('AggQSentCredReviewer'),
        'name': 'ESI Aggregate Query Sentence Credibility Reviewer',
        'description': 'Reviews the credibility of a query sentence by comparing it to semantically similar sentences in the Co-inform DB and the credibility of those.',
        'author': bot_describer.esiLab_organization(),
        'dateCreated': '2020-03-19T15:09:00Z',
        'applicationCategory': ['Disinformation Detection'],
        'softwareRequirements': ['python'],
        'softwareVersion': version,
        'executionEnvironment': bot_describer.inspect_execution_env(),
        'isBasedOn': sub_bots,
        'launchConfiguration': {
            'acred_pred_claim_search_url': cfg.get(
                'acred_pred_claim_search_url',
                'http://localhost:8070/test/api/v1/claim/internal-search')
        }
    }
    return {
        **result,
        'identifier': hashu.hash_dict(dictu.select_keys(
            result, content.ident_keys(result)))
    }
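# The 'identifier' computed above (and in the other bot_info functions below)
# is a content-based hash over the identity-relevant keys of the description.
# A minimal sketch of that pattern, assuming hashu.hash_dict is roughly a
# sha256 over a canonical JSON serialisation and dictu.select_keys is a plain
# key filter; the bodies below are illustrative, not the acred implementations:

import hashlib
import json


def select_keys(d, keys):
    # keep only the identity-relevant keys of the description
    return {k: d[k] for k in keys if k in d}


def hash_dict(d):
    # canonical JSON (sorted keys) so equal dicts always hash the same
    canonical = json.dumps(d, sort_keys=True, separators=(',', ':'))
    return hashlib.sha256(canonical.encode('utf-8')).hexdigest()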
def bot_info(sub_bots, cfg):
    """Returns a description for this TweetCredReviewer

    :param sub_bots: a list of bot items used by this TweetCredReviewer
    :param cfg: config options
    :returns: a `TweetCredReviewer` item
    :rtype: dict
    """
    result = {
        '@context': ci_context,
        '@type': 'TweetCredReviewer',
        'additionalType': content.super_types('TweetCredReviewer'),
        'name': 'ESI Tweet Credibility Reviewer',
        'description': 'Reviews the credibility of a tweet by reviewing the sentences in the tweet and the (textual) documents linked by the tweet',
        'author': bot_describer.esiLab_organization(),
        'dateCreated': '2020-04-02T18:00:00Z',
        'applicationCategory': ['Disinformation Detection'],
        'softwareRequirements': ['python', 'nltk', 'Cogito'],
        'softwareVersion': version,
        'executionEnvironment': bot_describer.inspect_execution_env(),
        'isBasedOn': sub_bots,
        'launchConfiguration': {},
        'taskConfiguration': {}
    }
    return {
        **result,
        'identifier': hashu.hash_dict(dictu.select_keys(
            result, content.ident_keys(result)))
    }
def bot_info(sub_bots, cfg):
    """Returns a description for this DBSentCredReviewer

    :param sub_bots: a list of bot items used by this DBSentCredReviewer
    :param cfg: config options
    :returns: a `DBSentCredReviewer` item
    :rtype: dict
    """
    result = {
        '@context': ci_context,
        '@type': 'DBSentCredReviewer',
        'name': 'ESI DB Sentence Credibility Reviewer',
        'description': 'Estimates the credibility of a sentence in the Co-inform DB based on known ClaimReviews or websites where the sentence has been published.',
        'additionalType': content.super_types('DBSentCredReviewer'),
        'author': bot_describer.esiLab_organization(),
        'dateCreated': dateCreated,
        'softwareVersion': version,
        'url': 'http://coinform.eu/bot/DBSentCredReviewer/%s' % version,
        'applicationSuite': 'Co-inform',
        'isBasedOn': sub_bots,
        'launchConfiguration': {
            'factchecker_website_to_qclaim_confidence_penalty_factor': float(
                cfg.get('factchecker_website_to_qclaim_confidence_penalty_factor',
                        0.5)),
            'acred_factchecker_urls': cfg.get('acred_factchecker_urls', [])
        }
    }
    ident = hashu.hash_dict(dictu.select_keys(
        result, content.ident_keys(result)))
    return {**result, 'identifier': ident}
def bot_info(sub_bots, cfg):
    """Returns a description for this ArticleCredReviewer

    :param sub_bots: bot items used by this ArticleCredReviewer
    :param cfg: config options
    :returns: an `ArticleCredReviewer`
    :rtype: dict
    """
    result = {
        '@context': content.ci_context,
        '@type': 'ArticleCredReviewer',
        'additionalType': content.super_types('ArticleCredReviewer'),
        'name': 'ESI Article Credibility Reviewer',
        'description': 'Reviews the credibility of an article by (i) semantically analysing it to detect relevant claims, (ii) getting credibility reviews for the claims and (iii) getting credibility reviews for the site(s) that published the article.',
        'author': bot_describer.esiLab_organization(),
        'dateCreated': '2020-04-01T17:02:00Z',
        'applicationCategory': ['Disinformation Detection'],
        'softwareRequirements': ['python', 'Cogito'],
        'softwareVersion': version,
        'executionEnvironment': bot_describer.inspect_execution_env(),
        'isBasedOn': sub_bots,
        'launchConfiguration': {
            # any launch configs?
        },
        'taskConfiguration': {
            'cred_conf_threshold': cfg.get('cred_conf_threshold', 0.7),
            'max_claims_in_doc': int(cfg.get('max_claims_in_doc', 5)),
            'relsents_in_colls': cfg.get('relsents_in_colls', [
                'generic', 'pilot-se', 'pilot-gr', 'pilot-at',
                'factcheckers', 'fc-dev']),
            'target_url_collect_coll': cfg.get(
                'target_url_collect_coll',
                cfg.get('default_url_collect_coll', None)),
            'acred_review_format': cfg.get('acred_review_format', 'schema.org')
        }
    }
    return {
        **result,
        'identifier': hashu.hash_dict(dictu.select_keys(
            result, content.ident_keys(result)))
    }
def stance_reviewer(model_meta, in_dir):
    """Returns a description for this SentStanceReviewer

    :param model_meta: metadata describing the trained stance model
    :param in_dir: directory containing the model config and weights
    :returns: a `SentStanceReviewer` item
    :rtype: dict
    """
    result = {
        '@context': 'http://coinform.eu',
        '@type': 'SentStanceReviewer',
        'additionalType': sentStanceReviewer_schema['super_types'],
        'name': 'ESI Sentence Stance Reviewer',
        'description': 'Assesses the stance between two sentences (e.g. agree, disagree, discuss). It was trained and evaluated on FNC-1, achieving 92% accuracy.',
        'author': bot_describer.esiLab_organization(),
        'dateCreated': '2020-01-13T15:18:00Z',
        'applicationCategory': ['NLP'],
        'applicationSubCategory': ['Stance Detection'],
        'applicationSuite': ['Co-inform'],
        'softwareRequirements': [
            'python', 'pytorch', 'transformers',
            'RoBERTaModel', 'RoBERTaTokenizer'],
        'softwareVersion': '0.1.1',
        'executionEnvironment': {
            **bot_describer.inspect_execution_env(),
            'cuda': torch.cuda.is_available()},
        'isBasedOn': [],
        'launchConfiguration': {
            'model': model_meta,
            'model_config': bot_describer.path_as_media_object(
                os.path.join(in_dir, 'config.json')),
            'pytorch_model': bot_describer.path_as_media_object(
                os.path.join(in_dir, 'pytorch_model.bin'))
        }
    }
    result['identifier'] = calc_stance_reviewer_id(result)
    return result
def bot_info(sub_bots, cfg):
    """Returns a description for this SentPolarityReviewer

    :param sub_bots: a list of bot items used by this SentPolarityReviewer
    :param cfg: config options
    :returns: a `SentPolarityReviewer` item
    :rtype: dict
    """
    result = {
        '@context': ci_context,
        '@type': 'SentPolarityReviewer',
        'name': 'ESI Sentence Polarity Reviewer',
        'description': 'Estimates the polar similarity between two sentences',
        'additionalType': content.super_types('SentPolarityReviewer'),
        'softwareVersion': version,
        'dateCreated': '2020-03-27T22:54:00Z',
        'url': 'http://coinform.eu/bot/SentencePolarSimilarityReviewer/%s' % version,
        'applicationSuite': 'Co-inform',
        'author': bot_describer.esiLab_organization(),
        'isBasedOn': sub_bots,
        'launchConfiguration': {}
    }
    ident = hashu.hash_dict(
        dictu.select_keys(result, content.ident_keys(result)))
    return {**result, 'identifier': ident}
def worth_reviewer(model_meta, in_dir):
    """Returns a description for this SentCheckWorthinessReviewer

    :param model_meta: metadata describing the trained check-worthiness model
    :param in_dir: directory containing the model config and weights
    :returns: a `SentCheckWorthinessReviewer` item
    :rtype: dict
    """
    result = {
        '@context': 'http://coinform.eu',
        '@type': 'SentCheckWorthinessReviewer',
        'additionalType': sentWorthReviewer_schema['super_types'],
        'name': 'ESI Sentence Worth Reviewer',
        'description': 'Assesses the check-worthiness of a sentence: CFS (worthy) or NCS (not worthy). It was trained and evaluated on a combination of datasets (CBD+Poynter+Clef\'19T1), achieving 95% accuracy.',
        'author': bot_describer.esiLab_organization(),
        'dateCreated': '2020-05-08T15:18:00Z',
        'applicationCategory': ['NLP'],
        'applicationSubCategory': ['Check-worthiness'],
        'applicationSuite': ['Co-inform'],
        'softwareRequirements': [
            'python', 'pytorch', 'transformers',
            'RoBERTaModel', 'RoBERTaTokenizer'],
        'softwareVersion': '0.1.0',
        'executionEnvironment': {
            **bot_describer.inspect_execution_env(),
            'cuda': torch.cuda.is_available()},
        'isBasedOn': [],
        'launchConfiguration': {
            'model': model_meta,
            'model_config': bot_describer.path_as_media_object(
                os.path.join(in_dir, 'config.json')),
            'pytorch_model': bot_describer.path_as_media_object(
                os.path.join(in_dir, 'pytorch_model.bin'))
        }
    }
    result['identifier'] = calc_worth_reviewer_id(result)
    return result
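# Both reviewer descriptions above reference the fine-tuned RoBERTa checkpoint
# files via bot_describer.path_as_media_object. A minimal sketch of such a
# helper, assuming it wraps a local file as a schema.org MediaObject with a
# content hash and size; the body below is illustrative, not the actual acred
# implementation:

import hashlib
import os


def path_as_media_object(path):
    # describe a local file as a schema.org MediaObject
    with open(path, 'rb') as f:
        sha256 = hashlib.sha256(f.read()).hexdigest()
    return {
        '@type': 'MediaObject',
        'name': os.path.basename(path),
        'identifier': sha256,  # content hash, so the description is reproducible
        'contentSize': '%d bytes' % os.path.getsize(path)
    }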
def bot_info(cfg):
    """Returns a description for this ClaimReviewNormalizer

    :param cfg: config options
    :returns: a `ClaimReviewNormalizer` item
    :rtype: dict
    """
    result = {
        '@context': ci_context,
        '@type': 'ClaimReviewNormalizer',
        'name': 'ESI ClaimReview Credibility Normalizer',
        'description': 'Analyses the alternateName and numerical rating value of a ClaimReview and tries to convert that into a normalised credibility rating',
        'additionalType': content.super_types('ClaimReviewNormalizer'),
        'author': bot_describer.esiLab_organization(),
        'dateCreated': dateCreated,
        'softwareVersion': version,
        'url': 'http://coinform.eu/bot/ClaimReviewNormalizer/%s' % version,
        'applicationSuite': 'Co-inform',
        'isBasedOn': [],  # no dependencies
        'launchConfiguration': {}  # no configs?
    }
    ident = hashu.hash_dict(
        dictu.select_keys(result, content.ident_keys(result)))
    return {**result, 'identifier': ident}
def bot_info(sub_bots, cfg):
    """Returns a description for this CredReviewer

    :param sub_bots: a list of bot items used by this CredReviewer
    :param cfg: config options
    :returns: a `CredReviewer` item
    :rtype: dict
    """
    result = {
        '@context': ci_context,
        '@type': 'CredReviewer',
        'additionalType': content.super_types('CredReviewer'),
        'name': 'ESI Top-level Credibility Reviewer',
        'description': 'Reviews the credibility of various supported content items, mainly by delegating to the appropriate content-level reviewer',
        'author': bot_describer.esiLab_organization(),
        'dateCreated': '2020-04-02T18:05:00Z',
        'applicationCategory': ['Disinformation Detection'],
        'softwareRequirements': ['python'],
        'softwareVersion': version,
        'executionEnvironment': bot_describer.inspect_execution_env(),
        'isBasedOn': sub_bots,
        'launchConfiguration': {},
        'taskConfiguration': {}
    }
    return {
        **result,
        'identifier': hashu.hash_dict(dictu.select_keys(
            result, content.itemref_keys(result)))
    }
def sim_reviewer(vec_space, index_format):
    """Returns a description for this SemSentSimReviewer

    :param vec_space: a vector space dict (see `load_tsv_vector_space`
        below), extended with a `semantic_encoder_info_fn` describing the
        sentence encoder
    :param index_format: name of the index format used
    :returns: a `SemSentSimReviewer` item
    :rtype: dict
    """
    semenc_info = vec_space['semantic_encoder_info_fn']()
    result = {
        '@context': 'http://coinform.eu',
        '@type': 'SemSentSimReviewer',
        'additionalType': ['SoftwareApplication', 'Bot'],
        'name': 'ESI Sentence Similarity Reviewer %s' % index_format,
        'description': 'Claim neural index that uses a semantic similarity measure based on a semantic encoder. It achieved 83% accuracy on STS-B.',
        'author': bot_describer.esiLab_organization(),
        'dateCreated': '2020-03-19T15:09:00Z',
        'applicationCategory': ['NLP'],
        'applicationSubCategory': ['SemanticSimilarity'],
        'applicationSuite': ['Co-inform'],
        'softwareRequirements': ['python', 'numpy'],
        'softwareVersion': '0.1.0-%s' % index_format,
        'executionEnvironment': bot_describer.inspect_execution_env(),
        'isBasedOn': [semenc_info],
        'launchConfiguration': {
            'vecSpace': vec_space['dataset_info']
        }
    }
    result['identifier'] = calc_sim_reviewer_id(result)
    return result
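# Hypothetical wiring sketch: the vec_space dict consumed above can be built
# with load_tsv_vector_space (below) and then extended with a
# semantic_encoder_info_fn. The file name, encoder description and index
# format label here are illustrative assumptions, not values from the source:

def example_sim_reviewer():
    vec_space = load_tsv_vector_space('sent_embeddings.tsv')  # assumed path
    vec_space['semantic_encoder_info_fn'] = lambda: {
        '@type': 'SoftwareApplication',  # assumed encoder description
        'name': 'ESI semantic sentence encoder'
    }
    return sim_reviewer(vec_space, 'faiss')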
def load_tsv_vector_space(tsv_vecs_path, sep='\t'):
    """Loads a sentence embeddings file and creates a vecspace dict that
    stores the vectors with their labels and indices useful for searching
    the space.

    :param tsv_vecs_path: path to the stored embeddings file
    :type tsv_vecs_path: str
    :param sep: field separator of the embeddings file
    :type sep: str
    :return: dictionary that contains the embedding `labels`, the numpy
        array of sentence `vectors`, the created `faiss_index`, the
        `source` path of the embeddings and the number of embedding
        dimensions `dim`
    :rtype: dict
    """
    labels = []
    vectors = []
    start = time.time()
    logger.info('Loading vectors from %s' % tsv_vecs_path)
    ndims = None
    with open(tsv_vecs_path, 'r', encoding='utf-8') as vecs_f:
        for line_idx, line in enumerate(vecs_f):
            elems = line.split(sep)
            labels.append(elems[0])
            if ndims is None:
                ndims = len(elems[1:])
            msg = 'line %d, expecting %d dims, but %d' % (
                line_idx, ndims, len(elems[1:]))
            assert ndims == len(elems[1:]), msg
            vectors.append(
                np.array(list(map(float, elems[1:])), dtype=np.float32))
    vectors = np.vstack(vectors)
    labels_set = set(labels)
    if len(labels_set) != len(labels):
        logger.warning('Repeated labels, %d vs %d' % (
            len(labels), len(labels_set)))
    # the dimension seen while reading must match the stacked array width
    assert ndims == vectors.shape[1], '%d != %d' % (ndims, vectors.shape[1])
    logger.info('Loaded %d vectors in %ds' % (
        len(labels), (time.time() - start)))
    nvectors = normalize(vectors)
    return {
        'labels': labels,
        'vectors': nvectors,
        'faiss_index': create_faiss_index(nvectors, ndims),
        'source': tsv_vecs_path,
        'dim': ndims,
        'dataset_info': {
            '@context': 'http://schema.org',
            '@type': 'Dataset',
            'name': 'Co-inform Sentence embeddings',
            'identifier': hashu.sha256_file(tsv_vecs_path),
            'description': 'Dataset of %d sentence embeddings extracted from claim reviews and articles collected as part of the Co-inform project' % len(labels),
            'dateCreated': isodate.as_utc_timestamp(
                os.path.getctime(tsv_vecs_path)),
            'dateModified': isodate.as_utc_timestamp(
                os.path.getmtime(tsv_vecs_path)),
            'creator': bot_describer.esiLab_organization(),
            'encoding': {
                '@type': 'MediaObject',
                'contentSize': bot_describer.readable_file_size(tsv_vecs_path),
                'encodingFormat': 'text/tab-separated-values'
            }
        }
    }
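# create_faiss_index is called above but not shown in this excerpt. A minimal
# sketch, assuming an exact inner-product index: since the vectors are
# L2-normalised above, inner product equals cosine similarity. The actual
# acred index construction may differ:

import faiss
import numpy as np


def create_faiss_index(nvectors, ndims):
    # exact (flat) index over inner products of the normalised vectors
    index = faiss.IndexFlatIP(ndims)
    index.add(nvectors)  # expects an (n, ndims) float32 array
    return index


# Expected input layout (illustrative): one embedding per line, a label
# followed by sep-separated float components, e.g.
#   sent-00042<TAB>0.0213<TAB>-0.1170<TAB>...<TAB>0.0045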