def filter_samples(input_matrix,
                   mend_results_path,
                   mend_manifest_path,
                   clinical_path,
                   nofilter=False,
                   verbose=False):
    """Filters samples by QC and tumor status.
    Takes path to samples expression file; tgz of MEND QC results;
    manifest of MEND QC files to sample ids; and clinical data file.
    Returns data frame of sample expression filtered to contain only
    MEND QC pass tumor samples. Filters are skippable via nofilter if,
    for example, clinical and qc files aren't available."""
    print_v = print if verbose else lambda *a, **k: None

    print_v("Loading sample matrix {}".format(input_matrix))
    raw_samples = utils.read_rds(input_matrix)

    if nofilter:
        print_v("Skipped the QC and tumor status filters; using all samples.")
        return raw_samples

    # Filter to only sample ids with a MEND QC status of PASS
    # Samples not mentioned in the qc_status (ie, no qc result) will be dropped by the filter
    qc_status = get_qc_status(mend_results_path, mend_manifest_path)
    qc_pass_ids = [k for k, v in qc_status.items() if v == "PASS"]
    qc_pass_ids_in_dataset = set(raw_samples.columns).intersection(qc_pass_ids)
    print_v("QC filter: dropped {} samples that were not MEND QC PASS.".format(
        len(raw_samples.columns) - len(qc_pass_ids_in_dataset)))

    # Warn about any sample ids that had no corresponding QC file; it's ok to continue even if some are missing results
    samples_without_qc_results = set(raw_samples.columns) - set(
        qc_status.keys())
    if len(samples_without_qc_results):
        print(
            "WARNING: {} samples were dropped due to missing MEND QC results. Sample IDs: {}"
            .format(len(samples_without_qc_results),
                    ", ".join(samples_without_qc_results)))

    # Further filter to only tumor RNA-Seq sample ids
    clinical = utils.read_tsv(clinical_path)
    tumor_ids = clinical[(clinical["sample_type"] == "Tumor")
                         & (clinical["composition"] == "Solid Tissue")
                         & (clinical["experimental_strategy"]
                            == "RNA-Seq")].index
    qc_pass_tumor_ids_in_dataset = qc_pass_ids_in_dataset.intersection(
        tumor_ids)
    print_v(
        "Tumor filter: dropped {} samples that were not solid tumor RNA-Seq.".
        format(
            len(qc_pass_ids_in_dataset) - len(qc_pass_tumor_ids_in_dataset)))

    # And apply filtered IDs to the original dataset, retaining the original column order
    qc_pass_tumor_samples = raw_samples[[
        sid for sid in raw_samples.columns
        if sid in qc_pass_tumor_ids_in_dataset
    ]]
    return qc_pass_tumor_samples
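A minimal usage sketch, assuming hypothetical file paths and that the RDS expression matrix has sample IDs as columns (none of this is from the original example):
# Hypothetical invocation; filter_samples and utils come from the module above.
filtered = filter_samples(
    "data/expression_matrix.rds",    # assumed expression matrix (samples as columns)
    "data/mend_qc_results.tgz",      # assumed MEND QC results tarball
    "data/mend_qc_manifest.tsv",     # assumed filename-to-sample-ID manifest
    "data/clinical.tsv",             # assumed clinical metadata TSV
    verbose=True,
)
print("Kept {} QC-pass tumor samples".format(len(filtered.columns)))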
Example #2
def prepare_commonvoice(commonvoice_location, audio_path, text_path,
                        lists_path, processes):
    for f in ['dev', 'test', 'train']:
        dst_list = os.path.join(lists_path, f"commonvoice-{f}.lst")
        dst_text = os.path.join(text_path, f"commonvoice-{f}.txt")
        if not os.path.exists(dst_list):
            to_list = partial(commonvoice_to_list, audio_path, f,
                              commonvoice_location)
            with Pool(processes) as p:
                rows = read_tsv(os.path.join(commonvoice_location, f"{f}.tsv"))
                samples = list(tqdm(
                    p.imap(to_list, rows),
                    total=len(rows),
                ))
            with open(dst_list, "w") as list_f:
                list_f.writelines(samples)

            with open(dst_list, "r") as list_f, open(dst_text, "w") as text_f:
                for line in list_f:
                    text_f.write(" ".join(line.strip().split(" ")[3:]) + "\n")

        else:
            print(f"{dst_list} exists, doing verify")
            new_list = []
            with open(dst_list, "r") as list_f:
                for line in list_f:
                    filename = line.split(" ")[1]
                    text = " ".join(line.strip().split(" ")[3:])
                    params = " ".join(line.strip().split(" ")[:3])
                    text = remove_punct(text)
                    line = f"{params} {text}\n"
                    if not os.path.exists(filename) or len(
                            text) < 2 or not alpha.match(text):
                        print(
                            f"{filename} does not exist or its text is empty, text: {text}"
                        )
                    else:
                        new_list.append(line)
            with open(dst_list, "w") as list_f:
                list_f.writelines(new_list)

    print("Prepared CommonVoice", flush=True)
Example #3
def get_qc_status(mend_results_path, mend_manifest_path):
    """Takes: mend results tgz containing bam_umend_qc.tsv files, and
    manifest containing mapping from filename to sample id.
    returns dictionary from sample id to mend status: PASS or FAIL"""
    mend_tgz = tarfile.open(mend_results_path)
    qc_files = (i for i in mend_tgz.getmembers()
                if i.name.endswith("bam_umend_qc.tsv"))
    # Dictionary of filename (UUID.bam_umend_qc.tsv) to PASS or FAIL string
    filename_map = {
        i.name: extract_sample_qc_status(mend_tgz.extractfile(i), i.name)
        for i in qc_files
    }
    # Find sample ID entries in manifest and map them via filename.
    manifest = utils.read_tsv(mend_manifest_path)

    # If a QC result file is not listed in the manifest, this throws KeyError
    # This would indicate major failure in the QC script and should interrupt analysis.
    return {
        manifest.loc[k]["Kids.First.Biospecimen.ID"]: v
        for k, v in filename_map.items()
    }
Example #4
def get_qc_status(mend_results_path, mend_manifest_path):
    """Takes: mend results tgz containing bam_umend_qc.tsv files, and
    manifest containing mapping from filename to sample id.
    returns dictionary from sample id to mend status: PASS or FAIL"""
    # Find sample file entries from manifest.
    manifest = utils.read_tsv(mend_manifest_path)
    manifest_files = [
        file for file in manifest.index.values
        if file.endswith("bam_umend_qc.tsv")
    ]
    # Open the tarfile and process the QC files
    mend_tgz = tarfile.open(mend_results_path)
    qc_files = (i for i in mend_tgz.getmembers() if i.name in manifest_files)
    # Dictionary of filename (UUID.bam_umend_qc.tsv) to PASS or FAIL string
    filename_map = {
        i.name: extract_sample_qc_status(mend_tgz.extractfile(i), i.name)
        for i in qc_files
    }
    return {
        manifest.loc[k]["Kids.First.Biospecimen.ID"]: v
        for k, v in filename_map.items()
    }
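For illustration only, a sketch of how the returned mapping might be consumed (the paths are hypothetical):
qc_status = get_qc_status("data/mend_qc_results.tgz", "data/mend_qc_manifest.tsv")
passing = [sample_id for sample_id, status in qc_status.items() if status == "PASS"]
print("{} of {} samples passed MEND QC".format(len(passing), len(qc_status)))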
Example #5
 def get_train_examples(self, data_dir):
     return self._create_examples(read_tsv(os.path.join(data_dir, "train.tsv")), "train")
Example #6
if '--sem-sentido' in argv:
    nosense_flag = '_unconsidering_sense'

else:
    print(
        'Considering strand sense by default (pass --sem-sentido to skip it).'
    )
    nosense_flag = ''

outpath = pardir / f'genome_annotation/head_genes_correlations{nosense_flag}.tsv'
out_aggregated_counts = pardir / f'counted_reads/aggregated{nosense_flag}.tsv'
outfile = safe_open(outpath)

print('Looking up gene and head lengths...')
gene_attributes = read_tsv(pardir / 'genome_annotation/gene_annotations.gff3',
                           names=GFF3_COLUMNS,
                           usecols=['attributes'])['attributes']
head_attributes = read_tsv(pardir / 'genome_annotation/head_annotations.gff3',
                           names=GFF3_COLUMNS,
                           usecols=['attributes'])['attributes']
gene_lengths = parse_gff_attributes(gene_attributes, gene_id='Name')['length']
head_lengths = parse_gff_attributes(head_attributes)['length']
lengths = pd.concat([head_lengths, gene_lengths]).astype(int)

print('Done. Reading relations file...')
relations = read_tsv(
    pardir / f'genome_annotation/head_genes_relations{nosense_flag}.tsv')

if outfile is not None:
    outfile.write('\t'.join(relations.columns) + '\tcorrelation\n')
Example #7
 def get_dev_examples(self, data_dir):
     return self._create_examples(read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
Example #8
def _load_sts_benchmark_dataset(dframe_file):
    dframe = read_tsv(dframe_file)
    dframe["Score"] = np.array(dframe['column_4'], dtype=np.float32)
    X, y = df_2_dset(dframe, sent1_col="column_5", sent2_col="column_6")
    return X, y
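A short, assumed usage example; the file name is hypothetical and the column names follow the snippet above:
X_train, y_train = _load_sts_benchmark_dataset("sts-benchmark-train.tsv")  # hypothetical file name
print(len(X_train), len(y_train))  # paired sentences and their similarity scores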
Example #9
    os.chdir(dl_dir)
    sites = glob("*")

    participants = pd.DataFrame([])
    participants_missing = pd.DataFrame([])

    for site_orig_name in sites:
        site = site_orig_name.replace("_", "")  # without _
        print("Converting {}".format(site))
        site_dir = os.path.join(dl_dir, site_orig_name)

        # merge participants files
        participants_file = os.path.join(site_dir, "participants.tsv")
        if os.path.isfile(participants_file):
            participants_site = read_tsv(participants_file)
            participants_site["site"] = site
            participants_site["orig_participant_id"] = participants_site[
                "participant_id"]
            participants_site["participant_id"] = participants_site[
                "site"] + "x" + participants_site["orig_participant_id"]
            participants = pd.concat((participants, participants_site))
        else:
            participants_site = pd.DataFrame({
                "site": [site],
                "participant_id":
                "participants file missing"
            })
            participants_missing = pd.concat(
                (participants_missing, participants_site))
Example #10
#!/bin/env python

import utils

directory = utils.data_location / 'golbeck_fakenews'

# this input file has been exported to TSV from `Fake News Stories.xlsx`
input_file = directory / 'intermediate' / 'data.tsv'

data = utils.read_tsv(input_file)

result = [{'url': row['URL of article'], 'label': 'fake', 'source': 'golbeck_fakenews'}
          for row in data if row['Fake or Satire?'].strip() == 'Fake']

utils.write_json_with_path(result, directory, 'urls.json')

by_domain = utils.compute_by_domain(result)

utils.write_json_with_path(by_domain, directory, 'domains.json')

rebuttals = {el['URL of article']: {u.strip(): ['golbeck_fakenews'] for u in el['URL of rebutting article'].split('; ')} for el in data}

utils.write_json_with_path(rebuttals, directory, 'rebuttals.json')
Example #11
                        default='data/glove.6B.300d.txt',
                        type=str)
    args = parser.parse_args()

    w2v = args.vectorization_method
    PoS = args.PoS_method
    NER = args.NER_method
    regressor = args.regressor

    if w2v == 'glove':
        _load_glove(args.glovefile, verbose=args.verbose)

    X_train, y_train = _load_sts_benchmark_dataset(args.training_set)
    X_dev, y_dev = _load_sts_benchmark_dataset(args.dev_set)
    X_test, y_test = _load_sts_benchmark_dataset(args.test_set)
    rest_dframe = read_tsv(args.companion_other_set)
    rest_dframe["Score"] = np.array(rest_dframe['column_3'], dtype=np.float32)
    X_rest, y_rest = df_2_dset(rest_dframe,
                               sent1_col="column_4",
                               sent2_col="column_5")

    if args.evaluate:
        if args.training_estimator is None:
            training_estimator = _build_distance_estimator(X_train,
                                                           y_train,
                                                           w2v,
                                                           PoS,
                                                           NER,
                                                           regressor,
                                                           verbose=1)
Example #12
#!/bin/env python

import csv
import itertools

import utils

subfolder_path = utils.data_location / 'mrisdal_fakenews'

data = utils.read_tsv(subfolder_path / 'source' / 'fake.csv', delimiter=',')

# set([el['type'] for el in data])
by_type_fn = lambda el: el['type']
cnt_by_type = {
    k: len(list(v))
    for k, v in itertools.groupby(sorted(data, key=by_type_fn), key=by_type_fn)
}
print('types', cnt_by_type)

by_site_fn = lambda el: el['site_url']
types_by_domain = {
    k: set([el['type'] for el in v])
    for k, v in itertools.groupby(sorted(data, key=by_site_fn), key=by_site_fn)
}

mappings = {
    'fake': 'fake',
    'junksci': 'fake',
    'hate': 'fake',
    'bs': 'fake',
    'bias': 'fake',
Example #13
#!/bin/env python

import utils

location = utils.data_location / 'wikipedia'

data = utils.read_tsv(location / 'source' / 'wikipedia.tsv')

domains = [{
    'domain': el['url'],
    'label': el['label'],
    'source': 'wikipedia'
} for el in data]

utils.write_json_with_path(domains, location, 'domains.json')
Example #14
from utils import read_tsv, pardir, redo_flag, overlaps, verbose, log, progress_flag
from re import findall
from time import time
# STRAND SENSES ARE NOT BEING TAKEN INTO ACCOUNT.

samdir = pardir / 'alinhamentos' / 'SRA_vs_genoma'
outdir = pardir / 'alinhamentos' / 'SRA_vs_genoma_on_heads'


def len_from_cigar(cigar):
    return sum([int(i) for i in findall(r'\d+', cigar)])
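A quick sanity check, assuming typical CIGAR strings: the helper simply sums every run length in the string, clips and deletions included:
assert len_from_cigar('10M2I5M') == 17    # 10 + 2 + 5
assert len_from_cigar('3S20M1D4M') == 28  # soft clip and deletion counted as well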


if __name__ == '__main__':

    heads = read_tsv(pardir / 'genome_annotation' / 'head_annotations.gff3',
                     header=None)
    heads['id'] = heads[8].apply(lambda s: s.strip('gene_id=').split(';')[0])
    heads = heads.iloc[:, [0, 3, 4, -4, -1]]
    heads.columns = ['chrom', 'start', 'end', 'sense', 'id']
    head_groups = heads.groupby('chrom')

    for sam_path in samdir.glob('*.sam'):

        outpath = outdir / (sam_path.stem + '.tsv')

        if outpath.exists() and not redo_flag:
            print(f"'{str(outpath)}' existe. Use -r se quiser sobrescrever.")
            continue

        print(f"\nTrabalhando com arquivo '{str(sam_path)}'.")
        #================== get n lines of sam file ===================#
Example #15
#!/bin/env python
from collections import defaultdict

import utils

subfolder = utils.data_location / 'jruvika_fakenews'

data = utils.read_tsv(subfolder / 'source' / 'data.csv', delimiter=',')

print(len(data))

# lots of urls are duplicated
by_url = defaultdict(set)
for el in data:
    # two rows have two different URLs each
    keys = [k.strip() for k in el['URLs'].split('; ')]
    value = 'true' if el['Label'] == '1' else 'fake'
    for k in keys:
        by_url[k].add(value)
        # be sure that when there are duplicates, the label is the same
        assert len(by_url[k]) == 1
urls = [{
    'url': k,
    'label': v.pop(),
    'source': 'jruvika_fakenews'
} for k, v in by_url.items()]
#by_url = {el['URLs'].strip(): 'true' if el['Label'] == '1' else 'fake' for el in data}
#urls = [{'url': k, 'label': v, 'source': 'jruvika_fakenews'} for k,v in by_url.items()]
print('unique urls', len(urls))

utils.write_json_with_path(urls, subfolder, 'urls.json')
Example #16
import pandas as pd
import os, argparse
from glob import glob
from utils import run, mkdir, to_tsv, read_tsv, add_info_to_json

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('bids_dir', help='bids dir')
    args = parser.parse_args()

    bids_dir = os.path.abspath(args.bids_dir)

    # Load data
    os.chdir(bids_dir)
    subjects = sorted(glob("sub-*"))
    # remove MPG1x subjects: the bids validator cannot parse their NIfTI headers (code 26, NIFTI_HEADER_UNREADABLE)
    subjects = list(filter(lambda s: "MPG1x" not in s, subjects))

    df = pd.DataFrame([])
    for s in subjects:
        print(s)
        site = s.split("x")[0].strip("sub-")
        df_file = os.path.join(bids_dir, s, "{}_sessions.tsv".format(s))
        if os.path.isfile(df_file):
            df_ = read_tsv(df_file, na_values=["n/a"])
            df_["site"] = site
            df = pd.concat((df, df_))

    to_tsv(df, "session_info.tsv")
Example #17
# description: Plots heads' to nearest gene(s) expression correlation as a function of heads' motherlengths.
# in: pardir/'genome_annotation/heads_motherlength.tsv' pardir/'genome_annotation/head_genes_correlations.tsv'
# out:
# plot:

from utils import read_tsv, pardir, save_all_figs, show_flag
import pandas as pd
from seaborn import heatmap
from matplotlib import pyplot as plt
from plot_read_counts import plot_heatmap

ml = read_tsv(pardir / 'genome_annotation/heads_motherlength.tsv',
              header=None,
              names=['head_id', 'ml'],
              index_col='head_id')

corr = read_tsv(pardir / 'genome_annotation/head_genes_correlations.tsv',
                usecols=['head_id', 'correlation'],
                index_col='head_id')

ml_corr = pd.merge(ml, corr, left_index=True, right_index=True).dropna()
#print(ml_corr)
ml_corr.plot('ml', 'correlation', 'scatter', alpha=.2)
plt.xlabel('Mother length (bp)')
plt.ylabel('Transcriptional correlation with the neighboring gene')

ml_corr.plot.hexbin('ml', 'correlation', gridsize=15, cmap='viridis')
plt.xlabel('Mother length (bp)')
plt.ylabel('Transcriptional correlation with the neighboring gene')

#======================== heatmap ========================#
Example #18
        #'unreliable': 'fake',
        'reliable': 'true'
    }
    properties = ['type', '2nd type', '3rd type']
    results = []
    # find the properties belonging to the mappings in the samples, and assign a single label
    for domain, props in data.items():
        looking_at = [
            prop_value for prop_name, prop_value in props.items()
            if prop_name in properties and prop_value
        ]
        #print(looking_at)
        classes = set(mappings[el] for el in looking_at if el in mappings)
        if len(classes) != 1:
            print(domain, classes)
            continue
        label = classes.pop()
        results.append({'domain': domain, 'label': label, 'source': source})

    utils.write_json_with_path(results, output_folder, 'domains.json')


data_opensources = utils.read_json(subfolder_opensources / 'source' /
                                   'sources' / 'sources.json')
data_melissa_tsv = utils.read_tsv(subfolder_melissa / 'source' /
                                  'melissa_zimdars_spreadsheet.tsv')
data_melissa = {el['url']: el for el in data_melissa_tsv}

process(data_opensources, subfolder_opensources, 'opensources')
process(data_melissa, subfolder_melissa, 'melissa_zimdars')
Example #19
 def get_test_examples(self, data_dir):
     return self._create_examples(
         read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test_matched"
     )
Example #20
#!/bin/env python

import utils

location = utils.data_location / 'factcheckni_list'

data = utils.read_tsv(location / 'source' /
                      'FactCheckNI Articles - OU Research - Sheet1.tsv')

label_map = {
    'Accurate': 'true',
    # 'Unsubstantiated': neither true nor false, no proof --> discard
    'Inaccurate': 'fake'
}

labeled_urls = [{
    'url': row['Claim URL'],
    'label': label_map[row['Label']],
    'source': 'factcheckni_list'
} for row in data if row['Label'] in label_map]

rebuttals = {
    row['Claim URL']: {
        row['Article URL']: ['factcheckni_list']
    }
    for row in data
}

utils.write_json_with_path(labeled_urls, location, 'urls.json')
utils.write_json_with_path(rebuttals, location, 'rebuttals.json')
Example #21
cnt = 0
with open("/home/clli/w2v/glove.840B.300d.txt", 'r', encoding='utf-8') as f:
    data = [line.replace("\n", "") for line in f.readlines()]
    for line in data:
        cnt += 1
        if cnt % 100000 == 0:
            print(cnt)
        line = line.split()
        word = ' '.join(line[:-300])
        embedding = [num for num in line[-300:]]
        bigEmbedding[word] = embedding
print("getting small embedding..")
wordbag.append('<PAD>')
resultEmb.append(' '.join([str(0.0)] * 300))
for filename in filelist:
    data = read_tsv(filename)
    for line in data:
        text = line[0]
        text2id = []
        sep_text = sepratewords(text).split()

        for word in sep_text:
            if word in smallEmbedding:
                continue
            elif word not in bigEmbedding:
                temp_emb = np.random.rand(300)
                smallEmbedding[word] = ' '.join(
                    [str(num) for num in list(temp_emb)])
                wordbag.append(word)
                resultEmb.append(smallEmbedding[word])
            else:
Example #22

filecolumns = {
    'cbsnews': default_mapping(),
    'dailydot': default_mapping(),
    'fakenewswatch': fakenewswatch_mapping(),
    'newrepublic': default_mapping(),
    'npr': default_mapping(),
    'snopes': default_mapping(),
    'thoughtco': default_mapping(),
    'usnews': usnews_mapping()
}

all_domains = []
for source, mappings in filecolumns.items():
    data = utils.read_tsv(location / 'intermediate' / '{}.tsv'.format(source))
    print(source)
    domains = [{
        'domain':
        el[mappings['domain_col']],
        'label':
        'true'
        if el[mappings['label_col']] in mappings['true_vals'] else 'fake',
        'source':
        'domain_list_{}'.format(source)
    } for el in data if el[mappings['label_col']] in mappings['true_vals'] +
               mappings['fake_vals']]

    all_domains.extend(domains)

utils.write_json_with_path(all_domains, location, 'domains.json')
Example #23
outdicpath = pardir / 'genome_annotation' / 'head_genes_relations.dic'
outpath = pardir / 'genome_annotation' / 'head_genes_relations.tsv'

# Safety check so we don't overwrite, like I just did >.<
# (check before opening: opening with 'w' would already truncate the file)
if outpath.exists() and not redo_flag:
    print(
        f"File '{str(outpath)}' already exists, nothing will be done. Use '-r' to overwrite."
    )
    exit()

outfile = outpath.open('w')

GFF_COLS = ['chrom', 'start', 'end', 'sense', 'id']

# WE ARE CONSIDERING STRAND SENSE
heads = read_tsv(pardir / 'genome_annotation/head_annotations.gff3',
                 header=None)
heads.drop([1, 2, 5, 7], inplace=True, axis=1)
heads[8] = heads[8].apply(lambda s: s.strip('gene_id=').split(';')[0])
heads.columns = GFF_COLS
head_groups = heads.groupby(['chrom', 'sense'])

genes = read_tsv(pardir / 'genome_annotation/gene_annotations.gff3',
                 header=None)
genes.drop([1, 2, 5, 7], inplace=True, axis=1)
genes[8] = genes[8].apply(lambda s: s.strip('gene_id=').split(';')[0])
genes.columns = GFF_COLS

# ### nudge the duplicates a little muahahah
genes.loc[genes.duplicated('start', 'last'), 'start'] += 1
genes.loc[genes.duplicated('end', 'last'), 'end'] += 1
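A tiny illustration on a made-up data frame of what the duplicate nudge above does: duplicated(..., 'last') flags every row but the last one sharing a coordinate, so those rows get shifted by one base:
import pandas as pd

demo = pd.DataFrame({'start': [100, 100, 250]})
demo.loc[demo.duplicated('start', 'last'), 'start'] += 1
print(demo['start'].tolist())  # [101, 100, 250]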
Example #24
#!/bin/env python

import os
from collections import defaultdict
from tqdm import tqdm

import utils
import unshortener

location = utils.data_location / 'rbutr'

data = utils.read_tsv(location / 'source' / 'link_data.tab.txt')

results = [{
    'url': el['sourcepage'],
    'label': 'fake',
    'source': 'rbutr'
} for el in data]

utils.write_json_with_path(results, location, 'urls.json')

domains = utils.compute_by_domain(results)

utils.write_json_with_path(domains, location, 'domains.json')

rebuttals = defaultdict(lambda: defaultdict(list))
for row in data:
    rebuttals[row['sourcepage']][row['rebuttalpage']].append('rbutr')

utils.write_json_with_path(rebuttals, location, 'rebuttals.json')
Example #25
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model explanations will be written.")
    parser.add_argument("--input_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input file where the examples are present.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--generate_html",
        action='store_true',
        help="Set this flag if you want to generate html for each example.")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.model_name_or_path,
                                          num_labels=num_labels,
                                          finetuning_task=args.task_name)
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path,
                                                do_lower_case=False)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config)
    model.to(args.device)

    c = Classifier(args, model, tokenizer)
    explainer = LimeTextExplainer(
        class_names=["Not paraphrases", "Paraphrases"])
    lines = read_tsv(args.input_file)
    oneoutfile = os.path.join(args.output_dir, 'expl_summary.tsv')
    with open(oneoutfile, "w") as of:
        for i in trange(len(lines), desc="Instance"):
            line = lines[i]
            exp = explainer.explain_instance(line[3],
                                             line[4],
                                             int(line[0]),
                                             c.predict_proba,
                                             num_features=6,
                                             num_samples=10)

            summary_line = str(i + 1)
            for e in exp.as_list():
                summary_line += "\t" + e[0] + "\t" + "%3f" % e[1]
            summary_line += "\n"
            of.write(summary_line)

            if args.generate_html:
                outfilename = os.path.join(args.output_dir,
                                           '_'.join([str(i + 1), 'exp.html']))
                exp.save_to_file(outfilename)
Example #26
import re
import os
import xml.etree.ElementTree as ET
from xml.dom import minidom
from lxml import etree
import urllib.parse as urlparse
import requests
from bs4 import BeautifulSoup
import html.parser as htmlparser
parser = htmlparser.HTMLParser()

import utils

folder = utils.data_location / 'buzzface'
source_file = folder / 'source' / 'facebook-fact-check.tab'

source_data = utils.read_tsv(source_file)

data = {el['post_id']: {'url': el['Post URL'], 'label': el['Rating']} for el in source_data if el['Post Type'] == 'link'}
# TODO: should the interesting classes be filtered here?

print(len(data))

# download the facebook page
for id, el in data.items():
    file_path = folder / 'intermediate' / '{}.html'.format(id)
    if not os.path.isfile(file_path):
        response = requests.get(el['url'])
        utils.write_file_with_path(response.text, folder / 'intermediate', '{}.html'.format(id))


unfiltered = []