def filter_samples(input_matrix, mend_results_path, mend_manifest_path, clinical_path, nofilter=False, verbose=False):
    """Filters samples by QC and tumor status.

    Takes the path to a sample expression file, a tgz of MEND QC results,
    a manifest mapping MEND QC files to sample ids, and a clinical data file.
    Returns a data frame of sample expression filtered to contain only
    MEND QC PASS tumor samples. Filters are skippable via nofilter if,
    for example, clinical and QC files aren't available."""
    print_v = print if verbose else lambda *a, **k: None
    print_v("Loading sample matrix {}".format(input_matrix))
    raw_samples = utils.read_rds(input_matrix)

    if nofilter:
        print_v("Skipped the QC and tumor status filters; using all samples.")
        return raw_samples

    # Filter to only sample ids with a MEND QC status of PASS.
    # Samples not mentioned in qc_status (ie, no QC result) will be dropped by the filter.
    qc_status = get_qc_status(mend_results_path, mend_manifest_path)
    qc_pass_ids = [k for k, v in qc_status.items() if v == "PASS"]
    qc_pass_ids_in_dataset = set(raw_samples.columns).intersection(qc_pass_ids)
    print_v("QC filter: dropped {} samples that were not MEND QC PASS.".format(
        len(raw_samples.columns) - len(qc_pass_ids_in_dataset)))

    # Call out any sample ids that had no corresponding QC file; it is ok to continue even if some are present.
    samples_without_qc_results = set(raw_samples.columns) - set(qc_status.keys())
    if len(samples_without_qc_results):
        print("WARNING: {} samples were dropped due to missing MEND QC results. Sample IDs: {}"
              .format(len(samples_without_qc_results),
                      ", ".join(samples_without_qc_results)))

    # Further filter to only tumor RNA-Seq sample ids.
    clinical = utils.read_tsv(clinical_path)
    tumor_ids = clinical[(clinical["sample_type"] == "Tumor")
                         & (clinical["composition"] == "Solid Tissue")
                         & (clinical["experimental_strategy"] == "RNA-Seq")].index
    qc_pass_tumor_ids_in_dataset = qc_pass_ids_in_dataset.intersection(tumor_ids)
    print_v("Tumor filter: dropped {} samples that were not solid tumor RNA-Seq.".format(
        len(qc_pass_ids_in_dataset) - len(qc_pass_tumor_ids_in_dataset)))

    # Apply the filtered IDs to the original dataset, retaining the original column order.
    qc_pass_tumor_samples = raw_samples[[
        sid for sid in raw_samples.columns if sid in qc_pass_tumor_ids_in_dataset
    ]]
    return qc_pass_tumor_samples
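# A minimal usage sketch for filter_samples (the file paths below are
# hypothetical, chosen only for illustration; `utils` and get_qc_status are
# assumed importable as in the snippet above):
expression = filter_samples(
    input_matrix="data/expression.rds",            # hypothetical path
    mend_results_path="data/mend_qc_results.tgz",  # hypothetical path
    mend_manifest_path="data/mend_manifest.tsv",   # hypothetical path
    clinical_path="data/clinical.tsv",             # hypothetical path
    verbose=True,
)
print(expression.shape)  # samples remaining after the QC and tumor filters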
def prepare_commonvoice(commonvoice_location, audio_path, text_path, lists_path, processes):
    for f in ['dev', 'test', 'train']:
        dst_list = os.path.join(lists_path, f"commonvoice-{f}.lst")
        dst_text = os.path.join(text_path, f"commonvoice-{f}.txt")
        if not os.path.exists(dst_list):
            to_list = partial(commonvoice_to_list, audio_path, f, commonvoice_location)
            with Pool(processes) as p:
                rows = read_tsv(os.path.join(commonvoice_location, f"{f}.tsv"))
                samples = list(tqdm(
                    p.imap(to_list, rows),
                    total=len(rows),
                ))
            with open(dst_list, "w") as list_f:
                list_f.writelines(samples)
            with open(dst_list, "r") as list_f, open(dst_text, "w") as text_f:
                for line in list_f:
                    text_f.write(" ".join(line.strip().split(" ")[3:]) + "\n")
        else:
            print(f"{dst_list} exists, verifying it instead")
            new_list = []
            with open(dst_list, "r") as list_f:
                for line in list_f:
                    filename = line.split(" ")[1]
                    text = " ".join(line.strip().split(" ")[3:])
                    params = " ".join(line.strip().split(" ")[:3])
                    text = remove_punct(text)
                    line = f"{params} {text}\n"
                    if not os.path.exists(filename) or len(text) < 2 or not alpha.match(text):
                        print(f"{filename} does not exist or its text is empty; text: {text}")
                    else:
                        new_list.append(line)
            with open(dst_list, "w") as list_f:
                list_f.writelines(new_list)
    print("Prepared CommonVoice", flush=True)
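# `commonvoice_to_list` is assumed above but not defined in this snippet.
# From how the .lst lines are consumed later (field 1 is the audio filename,
# fields 3+ are the transcript, the first three fields are "params"), a
# minimal sketch could look like the following. The id scheme and the
# duration placeholder are assumptions, and `row` is assumed to be a
# dict-like TSV row with the standard CommonVoice columns:
def commonvoice_to_list(audio_path, split, commonvoice_location, row):
    sample_id = f"{split}-{row['client_id'][:16]}"     # hypothetical id scheme
    audio_file = os.path.join(audio_path, row['path'])  # CommonVoice 'path' column
    duration = 0.0  # hypothetical; e.g. probed from the audio file
    return f"{sample_id} {audio_file} {duration} {row['sentence']}\n"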
def get_qc_status(mend_results_path, mend_manifest_path):
    """Takes: a MEND results tgz containing bam_umend_qc.tsv files, and a
    manifest mapping filename to sample id.
    Returns a dictionary from sample id to MEND status: PASS or FAIL."""
    mend_tgz = tarfile.open(mend_results_path)
    qc_files = (i for i in mend_tgz.getmembers() if i.name.endswith("bam_umend_qc.tsv"))

    # Dictionary of filename (UUID.bam_umend_qc.tsv) to PASS or FAIL string
    filename_map = {
        i.name: extract_sample_qc_status(mend_tgz.extractfile(i), i.name)
        for i in qc_files
    }

    # Find sample ID entries in the manifest and map them via filename.
    manifest = utils.read_tsv(mend_manifest_path)

    # If a QC result file is not listed in the manifest, this throws KeyError.
    # That would indicate a major failure in the QC script and should interrupt analysis.
    return {
        manifest.loc[k]["Kids.First.Biospecimen.ID"]: v
        for k, v in filename_map.items()
    }
def get_qc_status(mend_results_path, mend_manifest_path):
    """Takes: a MEND results tgz containing bam_umend_qc.tsv files, and a
    manifest mapping filename to sample id.
    Returns a dictionary from sample id to MEND status: PASS or FAIL."""
    # Find sample file entries from the manifest.
    manifest = utils.read_tsv(mend_manifest_path)
    manifest_files = [
        file for file in manifest.index.values
        if file.endswith("bam_umend_qc.tsv")
    ]

    # Open the tarfile and process the QC files
    mend_tgz = tarfile.open(mend_results_path)
    qc_files = (i for i in mend_tgz.getmembers() if i.name in manifest_files)

    # Dictionary of filename (UUID.bam_umend_qc.tsv) to PASS or FAIL string
    filename_map = {
        i.name: extract_sample_qc_status(mend_tgz.extractfile(i), i.name)
        for i in qc_files
    }
    return {
        manifest.loc[k]["Kids.First.Biospecimen.ID"]: v
        for k, v in filename_map.items()
    }
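# Both get_qc_status variants above rely on an extract_sample_qc_status
# helper that is not shown in these snippets. A minimal sketch, assuming the
# bam_umend_qc.tsv file is a two-column key/value TSV containing a
# "qc.status" row -- the row name and layout are assumptions, not confirmed
# by the source:
def extract_sample_qc_status(qc_fileobj, filename):
    """Return "PASS" or "FAIL" read from one bam_umend_qc.tsv file object."""
    for raw_line in qc_fileobj:  # tarfile.extractfile yields bytes lines
        fields = raw_line.decode("utf-8").strip().split("\t")
        if len(fields) >= 2 and fields[0] == "qc.status":  # hypothetical row name
            return fields[1]
    raise ValueError("No QC status found in {}".format(filename))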
def get_train_examples(self, data_dir):
    return self._create_examples(read_tsv(os.path.join(data_dir, "train.tsv")), "train")
if '--sem-sentido' in argv:
    nosense_flag = '_unconsidering_sense'
else:
    print('Considering strand sense by default (--sem-sentido to ignore it).')
    nosense_flag = ''

outpath = pardir / f'genome_annotation/head_genes_correlations{nosense_flag}.tsv'
out_aggregated_counts = pardir / f'counted_reads/aggregated{nosense_flag}.tsv'
outfile = safe_open(outpath)

print('Looking up gene and head lengths...')
gene_attributes = read_tsv(pardir / 'genome_annotation/gene_annotations.gff3',
                           names=GFF3_COLUMNS,
                           usecols=['attributes'])['attributes']
head_attributes = read_tsv(pardir / 'genome_annotation/head_annotations.gff3',
                           names=GFF3_COLUMNS,
                           usecols=['attributes'])['attributes']
gene_lengths = parse_gff_attributes(gene_attributes, gene_id='Name')['length']
head_lengths = parse_gff_attributes(head_attributes)['length']
lengths = pd.concat([head_lengths, gene_lengths]).astype(int)

print('Done. Reading relations file...')
relations = read_tsv(pardir / f'genome_annotation/head_genes_relations{nosense_flag}.tsv')

if outfile is not None:
    outfile.write('\t'.join(relations.columns) + '\tcorrelation\n')
def get_dev_examples(self, data_dir):
    return self._create_examples(read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def _load_sts_benchmark_dataset(dframe_file):
    dframe = read_tsv(dframe_file)
    dframe["Score"] = np.array(dframe['column_4'], dtype=np.float32)
    X, y = df_2_dset(dframe, sent1_col="column_5", sent2_col="column_6")
    return X, y
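# df_2_dset is assumed by this loader (and by the companion-set code below)
# but not defined in these snippets. A minimal sketch, assuming it pairs the
# two sentence columns and returns the "Score" column as the target; the
# exact return types are an assumption:
def df_2_dset(dframe, sent1_col, sent2_col):
    """Turn a scored sentence-pair dataframe into (X, y)."""
    X = list(zip(dframe[sent1_col], dframe[sent2_col]))
    y = np.array(dframe["Score"], dtype=np.float32)
    return X, y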
os.chdir(dl_dir)
sites = glob("*")

participants = pd.DataFrame([])
participants_missing = pd.DataFrame([])
for site_orig_name in sites:
    site = site_orig_name.replace("_", "")  # without _
    print("Converting {}".format(site))
    site_dir = os.path.join(dl_dir, site_orig_name)

    # merge participants files
    participants_file = os.path.join(site_dir, "participants.tsv")
    if os.path.isfile(participants_file):
        participants_site = read_tsv(participants_file)
        participants_site["site"] = site
        participants_site["orig_participant_id"] = participants_site["participant_id"]
        participants_site["participant_id"] = (participants_site["site"] + "x" +
                                               participants_site["orig_participant_id"])
        participants = pd.concat((participants, participants_site))
    else:
        participants_site = pd.DataFrame({
            "site": [site],
            "participant_id": "participants file missing"
        })
        participants_missing = pd.concat((participants_missing, participants_site))
#!/bin/env python

import utils

directory = utils.data_location / 'golbeck_fakenews'

# this input file has been exported to TSV from `Fake News Stories.xlsx`
input_file = directory / 'intermediate' / 'data.tsv'
data = utils.read_tsv(input_file)

result = [{'url': row['URL of article'],
           'label': 'fake',
           'source': 'golbeck_fakenews'}
          for row in data if row['Fake or Satire?'].strip() == 'Fake']

utils.write_json_with_path(result, directory, 'urls.json')

by_domain = utils.compute_by_domain(result)
utils.write_json_with_path(by_domain, directory, 'domains.json')

rebuttals = {el['URL of article']: {u.strip(): ['golbeck_fakenews']
                                    for u in el['URL of rebutting article'].split('; ')}
             for el in data}

utils.write_json_with_path(rebuttals, directory, 'rebuttals.json')
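# Several of these scripts index rows by header name (row['URL of article'],
# el['Label'], ...), so utils.read_tsv is assumed to yield dict-like rows. A
# minimal sketch of such a helper with csv.DictReader semantics -- the real
# utils implementation is not shown in the source:
import csv

def read_tsv(path, delimiter='\t'):
    """Read a delimited file into a list of dicts keyed by the header row."""
    with open(path, newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f, delimiter=delimiter))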
                    default='data/glove.6B.300d.txt',
                    type=str)
args = parser.parse_args()
w2v = args.vectorization_method
PoS = args.PoS_method
NER = args.NER_method
regressor = args.regressor
if w2v == 'glove':
    _load_glove(args.glovefile, verbose=args.verbose)

X_train, y_train = _load_sts_benchmark_dataset(args.training_set)
X_dev, y_dev = _load_sts_benchmark_dataset(args.dev_set)
X_test, y_test = _load_sts_benchmark_dataset(args.test_set)

rest_dframe = read_tsv(args.companion_other_set)
rest_dframe["Score"] = np.array(rest_dframe['column_3'], dtype=np.float32)
X_rest, y_rest = df_2_dset(rest_dframe, sent1_col="column_4", sent2_col="column_5")

if args.evaluate:
    if args.training_estimator is None:
        training_estimator = _build_distance_estimator(X_train, y_train,
                                                       w2v, PoS, NER,
                                                       regressor, verbose=1)
#!/bin/env python

import csv
import itertools

import utils

subfolder_path = utils.data_location / 'mrisdal_fakenews'

data = utils.read_tsv(subfolder_path / 'source' / 'fake.csv', delimiter=',')

# set([el['type'] for el in data])
by_type_fn = lambda el: el['type']
cnt_by_type = {
    k: len(list(v))
    for k, v in itertools.groupby(sorted(data, key=by_type_fn), key=by_type_fn)
}
print('types', cnt_by_type)

by_site_fn = lambda el: el['site_url']
types_by_domain = {
    k: set([el['type'] for el in v])
    for k, v in itertools.groupby(sorted(data, key=by_site_fn), key=by_site_fn)
}

mappings = {
    'fake': 'fake',
    'junksci': 'fake',
    'hate': 'fake',
    'bs': 'fake',
    'bias': 'fake',
#!/bin/env python

import utils

location = utils.data_location / 'wikipedia'

data = utils.read_tsv(location / 'source' / 'wikipedia.tsv')

domains = [{
    'domain': el['url'],
    'label': el['label'],
    'source': 'wikipedia'
} for el in data]

utils.write_json_with_path(domains, location, 'domains.json')
from utils import read_tsv, pardir, redo_flag, overlaps, verbose, log, progress_flag
from re import findall
from time import time

# STRAND SENSE IS NOT BEING TAKEN INTO ACCOUNT.

samdir = pardir / 'alinhamentos' / 'SRA_vs_genoma'
outdir = pardir / 'alinhamentos' / 'SRA_vs_genoma_on_heads'


def len_from_cigar(cigar):
    return sum([int(i) for i in findall(r'\d+', cigar)])


if __name__ == '__main__':
    heads = read_tsv(pardir / 'genome_annotation' / 'head_annotations.gff3', header=None)
    heads['id'] = heads[8].apply(lambda s: s.strip('gene_id=').split(';')[0])
    heads = heads.iloc[:, [0, 3, 4, -4, -1]]
    heads.columns = ['chrom', 'start', 'end', 'sense', 'id']
    head_groups = heads.groupby('chrom')

    for sam_path in samdir.glob('*.sam'):
        outpath = outdir / (sam_path.stem + '.tsv')
        if outpath.exists() and not redo_flag:
            print(f"'{str(outpath)}' exists. Use -r to overwrite.")
            continue

        print(f"\nWorking on file '{str(sam_path)}'.")
        #================== get n lines of sam file ===================#
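# A quick illustrative check of len_from_cigar (values chosen here, not taken
# from the source): it sums every numeric run in the CIGAR string, regardless
# of the operation type.
assert len_from_cigar("76M") == 76
assert len_from_cigar("10M2D5M") == 17  # deletion lengths are counted too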
#!/bin/env python

from collections import defaultdict

import utils

subfolder = utils.data_location / 'jruvika_fakenews'

data = utils.read_tsv(subfolder / 'source' / 'data.csv', delimiter=',')
print(len(data))

# lots of urls are duplicated
by_url = defaultdict(set)
for el in data:
    # two rows have two different URLs each
    keys = [k.strip() for k in el['URLs'].split('; ')]
    value = 'true' if el['Label'] == '1' else 'fake'
    for k in keys:
        by_url[k].add(value)
        # be sure that when there are duplicates, the label is the same
        assert len(by_url[k]) == 1

urls = [{
    'url': k,
    'label': v.pop(),
    'source': 'jruvika_fakenews'
} for k, v in by_url.items()]

#by_url = {el['URLs'].strip(): 'true' if el['Label'] == '1' else 'fake' for el in data}
#urls = [{'url': k, 'label': v, 'source': 'jruvika_fakenews'} for k,v in by_url.items()]

print('unique urls', len(urls))
utils.write_json_with_path(urls, subfolder, 'urls.json')
import pandas as pd
import os, argparse
from glob import glob
from utils import run, mkdir, to_tsv, read_tsv, add_info_to_json

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('bids_dir', help='bids dir')
    args = parser.parse_args()
    bids_dir = os.path.abspath(args.bids_dir)

    # Load data
    os.chdir(bids_dir)
    subjects = sorted(glob("sub-*"))

    # remove MPG1x subjects; the bids validator reports "We were unable to parse
    # header data from this NIfTI file (code: 26 - NIFTI_HEADER_UNREADABLE)"
    subjects = list(filter(lambda s: "MPG1x" not in s, subjects))

    df = pd.DataFrame([])
    for s in subjects:
        print(s)
        site = s.split("x")[0].strip("sub-")
        df_file = os.path.join(bids_dir, s, "{}_sessions.tsv".format(s))
        if os.path.isfile(df_file):
            df_ = read_tsv(df_file, na_values=["n/a"])
            df_["site"] = site
            df = pd.concat((df, df_))

    to_tsv(df, "session_info.tsv")
# description: Plots heads' correlation to nearest gene(s) expression as a function of the heads' mother lengths.
# in: pardir/'genome_annotation/heads_motherlength.tsv' pardir/'genome_annotation/head_genes_correlations.tsv'
# out:
# plot:

from utils import read_tsv, pardir, save_all_figs, show_flag
import pandas as pd
from seaborn import heatmap
from matplotlib import pyplot as plt
from plot_read_counts import plot_heatmap

ml = read_tsv(pardir / 'genome_annotation/heads_motherlength.tsv',
              header=None, names=['head_id', 'ml'], index_col='head_id')
corr = read_tsv(pardir / 'genome_annotation/head_genes_correlations.tsv',
                usecols=['head_id', 'correlation'], index_col='head_id')

ml_corr = pd.merge(ml, corr, left_index=True, right_index=True).dropna()
#print(ml_corr)

ml_corr.plot('ml', 'correlation', 'scatter', alpha=.2)
plt.xlabel('Mother length (bp)')
plt.ylabel('Transcriptional correlation with the neighboring gene')

ml_corr.plot.hexbin('ml', 'correlation', gridsize=15, cmap='viridis')
plt.xlabel('Mother length (bp)')
plt.ylabel('Transcriptional correlation with the neighboring gene')

#======================== heatmap ========================#
    #'unreliable': 'fake',
    'reliable': 'true'
}

properties = ['type', '2nd type', '3rd type']

results = []
# find the properties belonging to the mappings in the samples, and assign a single label
for domain, props in data.items():
    looking_at = [
        prop_value for prop_name, prop_value in props.items()
        if prop_name in properties and prop_value
    ]
    #print(looking_at)
    classes = set(mappings[el] for el in looking_at if el in mappings)
    if len(classes) != 1:
        print(domain, classes)
        continue
    label = classes.pop()
    results.append({'domain': domain, 'label': label, 'source': source})

utils.write_json_with_path(results, output_folder, 'domains.json')

data_opensources = utils.read_json(subfolder_opensources / 'source' / 'sources' / 'sources.json')
data_melissa_tsv = utils.read_tsv(subfolder_melissa / 'source' / 'melissa_zimdars_spreadsheet.tsv')
data_melissa = {el['url']: el for el in data_melissa_tsv}

process(data_opensources, subfolder_opensources, 'opensources')
process(data_melissa, subfolder_melissa, 'melissa_zimdars')
def get_test_examples(self, data_dir):
    return self._create_examples(
        read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test_matched"
    )
#!/bin/env python

import utils

location = utils.data_location / 'factcheckni_list'

data = utils.read_tsv(location / 'source' / 'FactCheckNI Articles - OU Research - Sheet1.tsv')

label_map = {
    'Accurate': 'true',
    # 'Unsubstantiated': neither true nor false, no proof --> discard
    'Inaccurate': 'fake'
}

labeled_urls = [{
    'url': row['Claim URL'],
    'label': label_map[row['Label']],
    'source': 'factcheckni_list'
} for row in data if row['Label'] in label_map]

rebuttals = {
    row['Claim URL']: {
        row['Article URL']: ['factcheckni_list']
    }
    for row in data
}

utils.write_json_with_path(labeled_urls, location, 'urls.json')
utils.write_json_with_path(rebuttals, location, 'rebuttals.json')
cnt = 0
with open("/home/clli/w2v/glove.840B.300d.txt", 'r', encoding='utf-8') as f:
    data = [line.replace("\n", "") for line in f.readlines()]
    for line in data:
        cnt += 1
        if cnt % 100000 == 0:
            print(cnt)
        line = line.split()
        # glove.840B tokens may contain spaces, so everything except
        # the last 300 fields is the word itself
        word = ' '.join(line[:-300])
        embedding = [num for num in line[-300:]]
        bigEmbedding[word] = embedding

print("getting small embedding..")
wordbag.append('<PAD>')
resultEmb.append(' '.join([str(0.0)] * 300))
for filename in filelist:
    data = read_tsv(filename)
    for line in data:
        text = line[0]
        text2id = []
        sep_text = sepratewords(text).split()
        for word in sep_text:
            if word in smallEmbedding:
                continue
            elif word not in bigEmbedding:
                temp_emb = np.random.rand(300)
                smallEmbedding[word] = ' '.join([str(num) for num in list(temp_emb)])
                wordbag.append(word)
                resultEmb.append(smallEmbedding[word])
            else:
filecolumns = {
    'cbsnews': default_mapping(),
    'dailydot': default_mapping(),
    'fakenewswatch': fakenewswatch_mapping(),
    'newrepublic': default_mapping(),
    'npr': default_mapping(),
    'snopes': default_mapping(),
    'thoughtco': default_mapping(),
    'usnews': usnews_mapping()
}

all_domains = []

for source, mappings in filecolumns.items():
    data = utils.read_tsv(location / 'intermediate' / '{}.tsv'.format(source))
    print(source)
    domains = [{
        'domain': el[mappings['domain_col']],
        'label': 'true' if el[mappings['label_col']] in mappings['true_vals'] else 'fake',
        'source': 'domain_list_{}'.format(source)
    } for el in data
        if el[mappings['label_col']] in mappings['true_vals'] + mappings['fake_vals']]
    all_domains.extend(domains)

utils.write_json_with_path(all_domains, location, 'domains.json')
outdicpath = pardir / 'genome_annotation' / 'head_genes_relations.dic'
outpath = pardir / 'genome_annotation' / 'head_genes_relations.tsv'

# Safety check so we don't overwrite, like I just did >.<
# (must run before opening the file for writing, or the data is already gone)
if outpath.exists() and not redo_flag:
    print(f"File '{str(outpath)}' already exists; nothing will be done. Use '-r' to overwrite.")
    exit()

outfile = outpath.open('w')

GFF_COLS = ['chrom', 'start', 'end', 'sense', 'id']

# WE ARE CONSIDERING STRAND SENSE
heads = read_tsv(pardir / 'genome_annotation/head_annotations.gff3', header=None)
heads.drop([1, 2, 5, 7], inplace=True, axis=1)
heads[8] = heads[8].apply(lambda s: s.strip('gene_id=').split(';')[0])
heads.columns = GFF_COLS
head_groups = heads.groupby(['chrom', 'sense'])

genes = read_tsv(pardir / 'genome_annotation/gene_annotations.gff3', header=None)
genes.drop([1, 2, 5, 7], inplace=True, axis=1)
genes[8] = genes[8].apply(lambda s: s.strip('gene_id=').split(';')[0])
genes.columns = GFF_COLS

# ### tweak the duplicates a little muahahah
genes.loc[genes.duplicated('start', 'last'), 'start'] += 1
genes.loc[genes.duplicated('end', 'last'), 'end'] += 1
#!/bin/env python

import os
from collections import defaultdict

from tqdm import tqdm

import utils
import unshortener

location = utils.data_location / 'rbutr'

data = utils.read_tsv(location / 'source' / 'link_data.tab.txt')

results = [{
    'url': el['sourcepage'],
    'label': 'fake',
    'source': 'rbutr'
} for el in data]

utils.write_json_with_path(results, location, 'urls.json')

domains = utils.compute_by_domain(results)
# write the aggregated domains, not the raw url list
utils.write_json_with_path(domains, location, 'domains.json')

rebuttals = defaultdict(lambda: defaultdict(list))
for row in data:
    rebuttals[row['sourcepage']][row['rebuttalpage']].append('rbutr')

utils.write_json_with_path(rebuttals, location, 'rebuttals.json')
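# utils.compute_by_domain is used here and in the golbeck script above but is
# not shown in the source. A minimal sketch, assuming it collapses url records
# into one labeled record per domain; the aggregation rule (first label wins)
# is an assumption:
from urllib.parse import urlparse

def compute_by_domain(url_records):
    """Group {'url', 'label', 'source'} records into one record per domain."""
    by_domain = {}
    for rec in url_records:
        domain = urlparse(rec['url']).netloc
        by_domain.setdefault(domain, {'domain': domain,
                                      'label': rec['label'],
                                      'source': rec['source']})
    return list(by_domain.values())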
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model explanations will be written.")
    parser.add_argument("--input_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input file where the examples are present.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--generate_html",
                        action='store_true',
                        help="Set this flag if you want to generate html for each example.")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.model_name_or_path,
                                          num_labels=num_labels,
                                          finetuning_task=args.task_name)
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path,
                                                do_lower_case=False)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config)
    model.to(args.device)

    c = Classifier(args, model, tokenizer)
    explainer = LimeTextExplainer(class_names=["Not paraphrases", "Paraphrases"])

    lines = read_tsv(args.input_file)
    oneoutfile = os.path.join(args.output_dir, 'expl_summary.tsv')
    with open(oneoutfile, "w") as of:
        for i in trange(len(lines), desc="Instance"):
            line = lines[i]
            exp = explainer.explain_instance(line[3],
                                             line[4],
                                             int(line[0]),
                                             c.predict_proba,
                                             num_features=6,
                                             num_samples=10)
            summary_line = str(i + 1)
            for e in exp.as_list():
                summary_line += "\t" + e[0] + "\t" + "%3f" % e[1]
            summary_line += "\n"
            of.write(summary_line)
            if args.generate_html:
                outfilename = os.path.join(args.output_dir, '_'.join([str(i + 1), 'exp.html']))
                exp.save_to_file(outfilename)
import re
import os
import xml.etree.ElementTree as ET
from xml.dom import minidom
from lxml import etree
import urllib.parse as urlparse

import requests  # needed for the page download below; missing from the original imports
from bs4 import BeautifulSoup
import html.parser as htmlparser

parser = htmlparser.HTMLParser()

import utils

folder = utils.data_location / 'buzzface'

source_file = folder / 'source' / 'facebook-fact-check.tab'

source_data = utils.read_tsv(source_file)
data = {el['post_id']: {'url': el['Post URL'], 'label': el['Rating']}
        for el in source_data if el['Post Type'] == 'link'}
# TODO should we filter the interesting classes here?
print(len(data))

# download the facebook page
for id, el in data.items():
    file_path = folder / 'intermediate' / '{}.html'.format(id)
    if not os.path.isfile(file_path):
        response = requests.get(el['url'])
        utils.write_file_with_path(response.text, folder / 'intermediate', '{}.html'.format(id))

unfiltered = []