def get_other_results(queries, qml_rankings, num_ranks=None):
  document_lookup = read_cache('./doc_lookup.json', get_robust_documents)
  document_title_to_id = read_cache('./document_title_to_id.json',
                                    lambda: print('failed'))
  document_id_to_title = _.invert(document_title_to_id)
  doc_ids = range(len(document_id_to_title))
  documents = [
      document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids
  ]
  tokenizer = Tokenizer(
      rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
  tokenized_documents = read_cache('tok_docs.json',
                                   lambda: tokenizer.process_all(documents))
  tokenized_queries = tokenizer.process_all(queries)
  bm25 = BM25(tokenized_documents)
  average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
  bm25_rankings = []
  glove_rankings = []
  rm3_rankings = []
  glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
  docs_lms = _calc_docs_lms(bm25.df, bm25.f)
  for q, qml_ranking in progressbar(zip(tokenized_queries, qml_rankings)):
    bm25_rankings.append(
        _get_bm25_ranking(bm25, qml_ranking, q, average_idf=average_idf))
    glove_rankings.append(
        _get_glove_ranking(glove, tokenized_documents, qml_ranking, q))
    rm3_rankings.append(_get_rm3_ranking(docs_lms, bm25.f, qml_ranking, q))
  return bm25_rankings, glove_rankings, rm3_rankings
def main():
  document_lookup = read_cache('./doc_lookup.json', get_robust_documents)
  document_title_to_id = create_id_lookup(document_lookup.keys())
  document_id_to_title = _.invert(document_title_to_id)
  doc_ids = range(len(document_id_to_title))
  documents = [
      document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids
  ]
  tokenizer = Tokenizer()
  tokenized_documents = read_cache('tok_docs.json',
                                   lambda: tokenizer.process_all(documents))
  bm25 = BM25(tokenized_documents)
  with open('./doc_word_idf.json', 'w+') as fh:
    json.dump(bm25.idf, fh)
def get_texts(texts, with_preprocess=False, pre_rules=None):
    r"""
    Uses fastai's Tokenizer because it does a series of very convenient things
    during the tokenization process.
    See here: https://docs.fast.ai/text.transform.html#Tokenizer
    """
    tok_func = Tokenizer()
    if with_preprocess:
        texts = [" ".join(simple_preprocess(s)) for s in texts]
    if pre_rules:
        tok_func.pre_rules = pre_rules + tok_func.pre_rules
    tokens = tok_func.process_all(texts)
    return tokens
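if __name__ == '__main__':
    # Hypothetical usage sketch for get_texts above; the sample strings are
    # made up and not from the original source.
    sample = ["<b>First</b> document.", "SECOND document, ALL CAPS."]
    print(get_texts(sample))                        # fastai rules only
    print(get_texts(sample, with_preprocess=True))  # gensim simple_preprocess first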
def baselines_eval():
  rankings_to_eval = read_query_test_rankings()
  qrels = parse_qrels()
  query_ids = list(qrels.keys())
  query_lookup = get_robust_eval_queries()
  queries = [query_lookup[query_id] for query_id in query_ids]
  k = 10 if len(sys.argv) == 1 else int(sys.argv[1])
  document_lookup = read_cache(name('./doc_lookup.json', ['with_titles']),
                               get_robust_documents_with_titles)
  document_title_to_id = read_cache('./document_title_to_id.json',
                                    lambda: print('failed'))
  ordered_rankings_to_eval = [[
      document_title_to_id[title] for title in rankings_to_eval[query]
  ] for query in query_ids]
  ordered_qrels = [[document_title_to_id[title] for title in qrels[query]]
                   for query in query_ids]
  document_id_to_title = _.invert(document_title_to_id)
  doc_ids = range(len(document_id_to_title))
  documents = [
      document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids
  ]
  tokenizer = Tokenizer(
      rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
  tokenized_documents = read_cache(
      'tok_docs.json',
      lambda: tokenizer.process_all(clean_documents(documents)))
  tokenized_queries = tokenizer.process_all(clean_documents(queries))
  bm25 = gensim_bm25.BM25(tokenized_documents)
  # with open('./caches/106756_most_common_doc.json', 'r') as fh:
  #   doc_token_set = set(json.load(fh))
  # corpus, token_lookup = tokens_to_indexes(tokenized_documents,
  #                                          None,
  #                                          token_set=doc_token_set)
  # corpus = [[[token_lookup[term], f] for term, f in doc_fs.items()]
  #           for doc_fs in bm25.f]
  # tfidf = TfidfModel(corpus)
  # lsi = LsiModel(tfidf, id2word=_.invert(token_lookup), num_topics=300)
  glove_rankings = []
  # lsi_rankings = []
  glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
  encoded_docs = torch.stack(
      [encode_glove_fs(glove, bm25.idf, doc_fs) for doc_fs in bm25.f])
  encoded_docs = encoded_docs / torch.norm(encoded_docs, dim=1).unsqueeze(1)
  for q, qml_ranking in progressbar(zip(tokenized_queries,
                                        ordered_rankings_to_eval),
                                    max_value=len(tokenized_queries)):
    doc_ids = qml_ranking[:k] if '--rerank' in sys.argv else None
    glove_rankings.append(
        rank_glove(glove, bm25.idf, encoded_docs, q, doc_ids=doc_ids))
    # lsi_rankings.append(
    #     rank_lsi(lsi,
    #              tfidf,
    #              [token_lookup[term] if term in token_lookup else 0
    #               for term in q],
    #              doc_ids=doc_ids))
  print('indri:', metrics_at_k(ordered_rankings_to_eval, ordered_qrels, k))
  print('glove:', metrics_at_k(glove_rankings, ordered_qrels, k))
def get_preds(user_input, vectorizer, m):
    tokenizer = Tokenizer()
    tok = SpacyTokenizer('en')
    toks = tokenizer.process_text(user_input, tok)
    term_doc = vectorizer.transform(toks)
    preds = m.predict(term_doc)
    toks_preds_df = pd.DataFrame()
    toks_preds_df['toks'] = toks
    toks_preds_df['preds'] = preds
    html_format_list = html_format(toks_preds_df)
    html_format_join = ' '.join(html_format_list)
    html_format_join = "<div>" + html_format_join + "</span></div>"
    return html_format_join
def preprocess_texts(texts,
                     token_lookup=None,
                     num_tokens=None,
                     token_set=None,
                     drop_if_any_unk=False):
  tokenizer = Tokenizer()
  tokenized = tokenizer.process_all(texts)
  idx_texts, token_lookup = tokens_to_indexes(
      tokenized,
      token_lookup,
      num_tokens=num_tokens,
      token_set=token_set,
      drop_if_any_unk=drop_if_any_unk)
  return idx_texts, token_lookup
def get_texts(df):
    labels = df[0].values.astype(np.int64)
    texts = f'\n{BOS} {FLD} 1 ' + df[1].astype(str)
    texts = list(texts.apply(fixup).values)
    tokens = Tokenizer(lang='en_core_web_sm').proc_all_mp(
        partition_by_cores(texts), lang='en_core_web_sm')
    return tokens, list(labels)
def prepare_clas_dataset(input_path,
                         output_dir=None,
                         valid_split=0.2,
                         tokenizer_lang="xx",
                         min_freq=2,
                         seed=42):
    """
    Reads a CSV file with texts and labels, splits it into training and
    validation sets, tokenizes the texts and saves datasets for fine-tuning
    and for classification.

    Attributes:
        input_path (str): Path to CSV file with texts in the first and labels
            in the second column.
        output_dir (str): Folder where to store the processed dataset.
        valid_split (float): A fraction of data used for validation.
        tokenizer_lang (str): Language setting for tokenizer.
        min_freq (int): Minimal number of occurrences of a word to be
            considered for adding to the vocabulary.
        seed (int): Random seed that determines the training-validation split.
    """
    input_path = Path(input_path)
    output_dir = Path(output_dir or input_path.parent)
    output_dir.mkdir(parents=True, exist_ok=True)
    train_df, valid_df = csv_to_train_valid_df(input_path, valid_split, seed)

    data_finetune_lm = TextLMDataBunch.from_df(
        output_dir,
        train_df,
        valid_df,
        tokenizer=Tokenizer(lang=tokenizer_lang),
        text_cols=0,
        min_freq=min_freq)
    data_clas = TextClasDataBunch.from_df(
        output_dir,
        train_df,
        valid_df,
        tokenizer=Tokenizer(lang=tokenizer_lang),
        text_cols=0,
        label_cols=1,
        vocab=data_finetune_lm.train_ds.vocab,
        bs=32,
        min_freq=min_freq)

    data_finetune_lm.save("data_finetune_lm.pkl")
    data_clas.save("data_clas.pkl")
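if __name__ == "__main__":
    # Hypothetical invocation of prepare_clas_dataset above; the CSV path is
    # made up and must hold texts in column 0 and labels in column 1.
    prepare_clas_dataset("data/reviews.csv",
                         output_dir="data/processed",
                         valid_split=0.1,
                         tokenizer_lang="en")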
def get_german_db(bs=32):
    tokenizer = Tokenizer(lang='de')
    tokenizer_proc = TokenizeProcessor(tokenizer=tokenizer)
    num_proc = NumericalizeProcessor()
    processor = [tokenizer_proc, num_proc]
    db = TextList\
        .from_csv('/data/10kgerman', 'train.csv', cols=["text"], processor=processor)\
        .split_by_rand_pct(0.1)\
        .label_from_df('c')\
        .databunch()
    db.batch_size = bs
    return db
def get_texts_and_tokenize(df, n_lbls=1):
    labels = df.iloc[:, range(n_lbls)].values
    labels = labels.tolist()
    labels = list(map(lambda x: ast.literal_eval(x[0]), labels))
    texts = f'\n{BOS} ' + df[n_lbls].astype(str)
    # perform common fixup to text:
    texts = list(texts.apply(fixup).values)
    labels = list(labels)
    # tokenize texts
    tok = Tokenizer().proc_all_mp(partition_by_cores(texts))
    return texts, tok, labels
def predict(self, text):
    self.model.reset()
    self.model.eval()
    input_string = 'xbos xfld 1 ' + text
    texts = [input_string]
    tokens = Tokenizer(lang='en_core_web_sm').proc_all_mp(
        partition_by_cores(texts), lang='en_core_web_sm')
    encoded_tokens = [self.inspire_data_stoi[p] for p in tokens[0]]
    token_array = np.reshape(np.array(encoded_tokens), (-1, 1))
    token_array = Variable(torch.from_numpy(token_array))
    prediction_scores = self.model(token_array)
    prediction_scores_numpy = prediction_scores[0].data.cpu().numpy()
    return numpy_softmax(prediction_scores_numpy[0])[0]
def __init__(self, model_name):
    super().__init__()
    setattr(self, 'model_class', AutoModelForSequenceClassification)
    self.model_name = model_name
    self.pt_tokenizer, self.pt_config, self.pt_model = [
        self.fetch_pretrained(i)
        for i in [self.tokenizer_class, self.config_class, self.model_class]
    ]
    self.base_tokenizer = transformer_tokenizer.TransformersBaseTokenizer(
        self.model_name, self.pt_tokenizer)
    self.tokenizer = Tokenizer(tok_func=self.base_tokenizer,
                               pre_rules=[],
                               post_rules=[])
    self.vocab = transformer_vocab.TransformersVocab(self.pt_tokenizer)
    self.model = CustTransformerModel(self.pt_model)
def __init__(self, path):
    texts = pd.read_csv(path + '/jokes_extended_vk_anekdot_preproc.csv',
                        index_col=0)
    texts.dropna(inplace=True)
    data = TextList.from_df(
        texts,
        processor=[TokenizeProcessor(tokenizer=Tokenizer(lang="xx")),
                   NumericalizeProcessor(min_freq=2, max_vocab=60000)])\
        .split_by_rand_pct(.1)\
        .label_for_lm()\
        .databunch(bs=64)
    self.learn = language_model_learner(data=data, arch=AWD_LSTM,
                                        pretrained=None)
    self.learn.load_pretrained(path + '/ulmfit/bestmodel_tune.pth',
                               path + '/ulmfit/bestmodel_tune_itos.pkl')
def evaluate_lm(data_path, model_dir, tokenizer_lang="xx",
                evaluate_custom_perplexity=False):
    """
    Evaluate metrics of a trained language model on any dataset of texts from
    a CSV file.

    Attributes:
        data_path (str): Path to CSV file with texts in the first column.
        model_dir (str): Directory with a trained language model.
        tokenizer_lang (str): Language setting for tokenizer.
        evaluate_custom_perplexity (bool): The perplexity is estimated as
            e^(avg. loss), but the average loss changes slightly with batch
            size. To get perplexity computed in a slower but controlled
            fashion, set `evaluate_custom_perplexity` to True. The discrepancy
            between perplexity and custom perplexity is empirically
            approximately 1%.
    """
    model_dir = Path(model_dir)
    with open(model_dir / "lm_itos.pkl", "rb") as f:
        itos = pickle.load(f)
    data_df = pd.read_csv(data_path, header=None)
    data = TextLMDataBunch.from_df("",
                                   data_df,
                                   data_df,
                                   text_cols=0,
                                   tokenizer=Tokenizer(lang=tokenizer_lang),
                                   vocab=Vocab(itos))
    with open(model_dir / "model_hparams.json", "r") as model_hparams_file:
        model_hparams = json.load(model_hparams_file)
    learner = lm_learner(data, AWD_LSTM, model_dir, pretrained=True,
                         config=model_hparams)
    loss, acc = learner.validate()
    print("Loss: {}, Perplexity: {}, Accuracy: {}".format(
        loss, exp(loss), acc))
    if evaluate_custom_perplexity:
        print("Custom perplexity: {}, Fraction OOV: {}, "
              "OOV perplexity contribution: {}".format(
                  *evaluate_perplexity(learner, data.valid_ds.x)))
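if __name__ == "__main__":
    # Hypothetical invocation of evaluate_lm above; the paths are made up and
    # model_dir must contain lm_itos.pkl, model_hparams.json and the weights.
    evaluate_lm("data/valid_texts.csv", "models/lm",
                tokenizer_lang="en", evaluate_custom_perplexity=True)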
def label_linkable_tokens(article_html, tokenizer=Tokenizer(), label_all=True):
    parsed_html = BeautifulSoup(article_html, 'html.parser')
    link_text = [link.text for link in parsed_html.find_all('a')]
    tokenised_links = tokenizer.process_all(link_text)
    tokenised_text = tokenizer.process_all([parsed_html.text])[0]
    target = np.zeros(len(tokenised_text))
    for link in tokenised_links:
        start_positions = kmp(tokenised_text, link)
        if label_all:
            for pos in start_positions:
                target[pos:pos + len(link)] = 1
        elif label_all == False and len(start_positions) > 0:
            pos = start_positions[0]
            target[pos:pos + len(link)] = 1
        else:
            pass
    return tokenised_text, target
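if __name__ == '__main__':
    # Hypothetical example for label_linkable_tokens above; the HTML is made
    # up. Tokens covered by the <a> anchor text should come back labelled 1.
    html = '<p>Read the <a href="/docs">fastai docs</a> for details.</p>'
    tokens, labels = label_linkable_tokens(html)
    print(list(zip(tokens, labels)))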
def prepare_lm_dataset(input_path,
                       output_dir=None,
                       valid_split=0.2,
                       tokenizer_lang="xx",
                       min_freq=2,
                       seed=42):
    """
    Reads a CSV file with texts for training a language model, splits it into
    training and validation sets, tokenizes and saves the dataset.

    Attributes:
        input_path (str): Path to CSV file with texts in the first column.
        output_dir (str): Folder where to store the processed dataset.
        valid_split (float): A fraction of data used for validation.
        tokenizer_lang (str): Language setting for tokenizer.
        min_freq (int): Minimal number of occurrences of a word to be
            considered for adding to the vocabulary.
        seed (int): Random seed that determines the training-validation split.
    """
    input_path = Path(input_path)
    output_dir = Path(output_dir or input_path.parent)
    output_dir.mkdir(parents=True, exist_ok=True)
    train_df, valid_df = csv_to_train_valid_df(input_path, valid_split, seed)

    data_lm = TextLMDataBunch.from_df(output_dir,
                                      train_df,
                                      valid_df,
                                      text_cols=0,
                                      tokenizer=Tokenizer(lang=tokenizer_lang),
                                      min_freq=min_freq)
    data_lm.save("data_lm.pkl")
    with open(output_dir / "data_lm_tokenized_train.txt", "w") as f:
        f.write("\n".join(map(str, list(data_lm.train_ds.x))))
    with open(output_dir / "data_lm_tokenized_valid.txt", "w") as f:
        f.write("\n".join(map(str, list(data_lm.valid_ds.x))))
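if __name__ == "__main__":
    # Hypothetical invocation of prepare_lm_dataset above; the CSV path is
    # made up and must hold one text per row in the first column.
    prepare_lm_dataset("data/wiki_texts.csv", output_dir="data/lm",
                       valid_split=0.1, tokenizer_lang="en")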
def get_processor(vocab_class_obj):
    # tokenizer is only built in the fastai branch; default it to None so the
    # return statement does not raise when SentencePiece is used.
    tokenizer = None
    if conf['use_sentencePiece']:
        processor = [
            OpenFileProcessor(),
            SPProcessor(
                sp_model=conf['sentencePiece']['model'],
                sp_vocab=conf['sentencePiece']['vocab'],
                max_sentence_len=conf['sentencePiece']['max_sentence_len'],
                max_vocab_sz=conf['sentencePiece']['max_vocab'])
        ]
    else:
        tokenizer = Tokenizer(tok_func=dna_tokenizer_n_char,
                              pre_rules=[],
                              post_rules=[],
                              special_cases=[],
                              n_cpus=conf['n_workers'])
        processor = [
            TokenizeProcessor(tokenizer=tokenizer,
                              include_bos=True,
                              include_eos=True),
            NumericalizeProcessor(vocab=vocab_class_obj,
                                  max_vocab=conf['max_vocab'])
        ]
    return processor, tokenizer
import csv
import multiprocessing as mp
import typing as t
from pathlib import Path

import joblib
from fastai.text import Tokenizer, SpacyTokenizer, Vocab
from tqdm import tqdm

# Create a Tokenizer with the default settings for English,
# including punctuation rules and exceptions
tokenizer = Tokenizer()
tok = SpacyTokenizer('en')


def tokenize(line: str) -> str:
    doc = ' '.join(tokenizer.process_text(line, tok))  # parse with FastAI tokenizer
    return doc + '\n'


def numericalize(doc: str, vocab: Vocab) -> t.List[int]:
    return vocab.numericalize(doc)


def build_vocab(docs: t.List[str],
                max_vocab: int = 10000,
                min_freq: int = 5) -> Vocab:
    return Vocab.create(docs, max_vocab=max_vocab, min_freq=min_freq)


if __name__ == '__main__':
    data_path = Path('../data/raw/reviews.csv')
    assert data_path.exists()
def get_texts_gensim(texts: List[str]) -> List[List[str]]:
    processed_texts = [' '.join(simple_preprocess(t)) for t in texts]
    tok = Tokenizer().process_all(processed_texts)
    return tok
def get_texts_fastai(texts: List[str]) -> List[List[str]]:
    fixed_texts = [fix_html(t) for t in texts]
    tok = Tokenizer().process_all(fixed_texts)
    return tok
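if __name__ == '__main__':
    # Hypothetical comparison of the two helpers above on the same made-up
    # input: the gensim path lowercases and strips punctuation, while the
    # fastai path keeps punctuation and adds special tokens such as xxmaj.
    sample = ["Hello WORLD!", "A &amp; B"]
    print(get_texts_gensim(sample))
    print(get_texts_fastai(sample))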
def _apply_tokenizer(self, text):
    # Tokenizer returns a nested list with one token list per word, e.g.
    # tokens = [[' ', 't_up', 'bos'], ['senjō'], ['no'], [' ', 't_up', 'eos']]
    tokens = Tokenizer(lang=self.lang).proc_all_mp(
        partition_by_cores(text.split()))
    # Flatten the nested list
    return [word for sublist in tokens for word in sublist]
def get_default_tokenizer():
    return Tokenizer()
def main(gpu: Param("GPU to run on", str) = None,
         max_cpu_per_dataloader: Param("Max CPU", int, opt=True) = 8,
         bs: Param("batch size", int) = 256,
         fp16: Param("mixed precision", int, opt=True) = 0,
         use_sp_processor: Param("use sentence piece as processor", int) = 0,
         sp_model: Param("sentence piece trained model file", str) = None,
         sp_vocab: Param("sentence piece trained model file", str) = None):
    datetime_str = f'{datetime.now():%Y-%m-%d_%H-%M-%S%z}'
    random_seed = 0
    max_vocab = 30000
    print('max_cpu_per_dataloader', max_cpu_per_dataloader, 'bs', bs,
          'fp16', fp16, 'sp_processor', use_sp_processor,
          'sp_model', sp_model, 'sp_vocab', sp_vocab)

    # Prepare dataset
    local_project_path = './data/sprot_lm/'

    # Distributed setup
    print('gpu', gpu)
    gpu = setup_distrib(gpu)
    n_gpus = num_distrib()
    if n_gpus > 0:
        workers = min(max_cpu_per_dataloader, num_cpus() // n_gpus)
    else:
        workers = min(max_cpu_per_dataloader, num_cpus())
    print(gpu, 'n_gpus', n_gpus)
    print(gpu, 'workers', workers)

    # Prepare fastai
    np.random.seed(random_seed)
    if not os.path.exists(local_project_path):
        os.makedirs(local_project_path)
    print('local_project_path:', local_project_path)

    # Tokenization
    tokenizer = Tokenizer(tok_func=dna_tokenizer,
                          pre_rules=[], post_rules=[], special_cases=[])
    processor = [
        TokenizeProcessor(tokenizer=tokenizer,
                          include_bos=True,
                          include_eos=True),
        NumericalizeProcessor(max_vocab=max_vocab)
    ]
    df = pickle.load(
        open('./data/sprot_lm/sprot_sequence_taxon_anc.pickle', 'rb'))
    if use_sp_processor:
        # './data/sprot_lm/tmp/spm.model', './data/sprot_lm/tmp/spm.vocab'
        processor = [
            OpenFileProcessor(),
            SPProcessor(sp_model=sp_model, sp_vocab=sp_vocab,
                        max_sentence_len=35826, max_vocab_sz=max_vocab)
        ]
    data_lm = (TextList.from_df(df, path=local_project_path,
                                cols='seq_anc_tax', processor=processor)
               .split_by_rand_pct(0.1, seed=random_seed)
               .label_for_lm()
               .databunch(bs=bs, num_workers=workers))
    data_lm.vocab.save(local_project_path + 'vocab_lm_sprot_seq_anc_tax-' +
                       datetime_str + '.pickle')
    print('data_cls Training set size', len(data_lm.train_ds))
    print('data_cls Validation set size', len(data_lm.valid_ds))
    print('vocab size ', len(data_lm.vocab.itos))

    learn_lm = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.1,
                                      pretrained=False)
    if gpu is None:
        print(gpu, 'DataParallel')
        learn_lm.model = nn.DataParallel(learn_lm.model)
    else:
        print(gpu, 'to_distributed')
        learn_lm.to_distributed(gpu)
    if fp16:
        learn_lm.to_fp16()

    lr = 3e-3
    print(gpu, 'freeze')
    learn_lm.freeze()
    # I don't know why multi-GPU doesn't work without first freezing
    learn_lm.fit_one_cycle(1, lr, moms=(0.8, 0.7))
    print(gpu, 'unfreeze')
    learn_lm.unfreeze()
    learn_lm.fit_one_cycle(10, lr * 10, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-1-' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-1-enc-' + datetime_str)
    learn_lm.fit_one_cycle(10, lr, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-2-' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-2-enc-' + datetime_str)
    learn_lm.fit_one_cycle(10, lr / 10, moms=(0.8, 0.7))
    learn_lm.save('lm-sp-anc-v1-3' + datetime_str)
    learn_lm.save_encoder('lm-sp-ans-v1-3-enc-' + datetime_str)
    learn_lm.export(file='export-lm-sp-ans-v1-3' + datetime_str + '.pkl')
    print('Done')
def main():
  global model_to_save
  global experiment
  global rabbit
  rabbit = MyRabbit(args)
  if rabbit.model_params.dont_limit_num_uniq_tokens:
    raise NotImplementedError()
  if rabbit.model_params.frame_as_qa:
    raise NotImplementedError
  if rabbit.run_params.drop_val_loss_calc:
    raise NotImplementedError
  if rabbit.run_params.use_softrank_influence and not rabbit.run_params.freeze_all_but_last_for_influence:
    raise NotImplementedError
  if rabbit.train_params.weight_influence:
    raise NotImplementedError
  experiment = Experiment(rabbit.train_params + rabbit.model_params +
                          rabbit.run_params)
  print('Model name:', experiment.model_name)
  use_pretrained_doc_encoder = rabbit.model_params.use_pretrained_doc_encoder
  use_pointwise_loss = rabbit.train_params.use_pointwise_loss
  query_token_embed_len = rabbit.model_params.query_token_embed_len
  document_token_embed_len = rabbit.model_params.document_token_embed_len
  _names = []
  if not rabbit.model_params.dont_include_titles:
    _names.append('with_titles')
  if rabbit.train_params.num_doc_tokens_to_consider != -1:
    _names.append('num_doc_toks_' +
                  str(rabbit.train_params.num_doc_tokens_to_consider))
  if not rabbit.run_params.just_caches:
    if rabbit.model_params.dont_include_titles:
      document_lookup = read_cache(name('./doc_lookup.json', _names),
                                   get_robust_documents)
    else:
      document_lookup = read_cache(name('./doc_lookup.json', _names),
                                   get_robust_documents_with_titles)
  num_doc_tokens_to_consider = rabbit.train_params.num_doc_tokens_to_consider
  document_title_to_id = read_cache(
      './document_title_to_id.json',
      lambda: create_id_lookup(document_lookup.keys()))
  with open('./caches/106756_most_common_doc.json', 'r') as fh:
    doc_token_set = set(json.load(fh))
  tokenizer = Tokenizer()
  tokenized = set(
      sum(tokenizer.process_all(list(get_robust_eval_queries().values())),
          []))
  doc_token_set = doc_token_set.union(tokenized)
  use_bow_model = not any([
      rabbit.model_params[attr] for attr in
      ['use_doc_out', 'use_cnn', 'use_lstm', 'use_pretrained_doc_encoder']
  ])
  use_bow_model = use_bow_model and not rabbit.model_params.dont_use_bow
  if use_bow_model:
    documents, document_token_lookup = read_cache(
        name(f'./docs_fs_tokens_limit_uniq_toks_qrels_and_106756.pkl', _names),
        lambda: prepare_fs(document_lookup,
                           document_title_to_id,
                           num_tokens=num_doc_tokens_to_consider,
                           token_set=doc_token_set))
    if rabbit.model_params.keep_top_uniq_terms is not None:
      documents = [
          dict(nlargest(rabbit.model_params.keep_top_uniq_terms,
                        _.to_pairs(doc),
                        itemgetter(1))) for doc in documents
      ]
  else:
    documents, document_token_lookup = read_cache(
        name(
            f'./parsed_docs_{num_doc_tokens_to_consider}_tokens_limit_uniq_toks_qrels_and_106756.json',
            _names),
        lambda: prepare(document_lookup,
                        document_title_to_id,
                        num_tokens=num_doc_tokens_to_consider,
                        token_set=doc_token_set))
  if not rabbit.run_params.just_caches:
    train_query_lookup = read_cache('./robust_train_queries.json',
                                    get_robust_train_queries)
    train_query_name_to_id = read_cache(
        './train_query_name_to_id.json',
        lambda: create_id_lookup(train_query_lookup.keys()))
  train_queries, query_token_lookup = read_cache(
      './parsed_robust_queries_dict.json',
      lambda: prepare(train_query_lookup,
                      train_query_name_to_id,
                      token_lookup=document_token_lookup,
                      token_set=doc_token_set,
                      drop_if_any_unk=True))
  query_tok_to_doc_tok = {
      idx: document_token_lookup.get(query_token) or
      document_token_lookup['<unk>']
      for query_token, idx in query_token_lookup.items()
  }
  names = [RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set]]
  if rabbit.train_params.use_pointwise_loss or not rabbit.run_params.just_caches:
    train_data = read_cache(
        name('./robust_train_query_results_tokens_qrels_and_106756.json',
             names),
        lambda: read_query_result(
            train_query_name_to_id,
            document_title_to_id,
            train_queries,
            path='./indri/query_result' +
            RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set]))
  else:
    train_data = []
  q_embed_len = rabbit.model_params.query_token_embed_len
  doc_embed_len = rabbit.model_params.document_token_embed_len
  if rabbit.model_params.append_difference or rabbit.model_params.append_hadamard:
    assert q_embed_len == doc_embed_len, 'Must use same size doc and query embeds when appending diff or hadamard'
  if q_embed_len == doc_embed_len:
    glove_lookup = get_glove_lookup(
        embedding_dim=q_embed_len,
        use_large_embed=rabbit.model_params.use_large_embed,
        use_word2vec=rabbit.model_params.use_word2vec)
    q_glove_lookup = glove_lookup
    doc_glove_lookup = glove_lookup
  else:
    q_glove_lookup = get_glove_lookup(
        embedding_dim=q_embed_len,
        use_large_embed=rabbit.model_params.use_large_embed,
        use_word2vec=rabbit.model_params.use_word2vec)
    doc_glove_lookup = get_glove_lookup(
        embedding_dim=doc_embed_len,
        use_large_embed=rabbit.model_params.use_large_embed,
        use_word2vec=rabbit.model_params.use_word2vec)
  num_query_tokens = len(query_token_lookup)
  num_doc_tokens = len(document_token_lookup)
  doc_encoder = None
  if use_pretrained_doc_encoder or rabbit.model_params.use_doc_out:
    doc_encoder, document_token_embeds = get_doc_encoder_and_embeddings(
        document_token_lookup, rabbit.model_params.only_use_last_out)
    if rabbit.model_params.use_glove:
      query_token_embeds_init = init_embedding(q_glove_lookup,
                                               query_token_lookup,
                                               num_query_tokens,
                                               query_token_embed_len)
    else:
      query_token_embeds_init = from_doc_to_query_embeds(
          document_token_embeds, document_token_lookup, query_token_lookup)
    if not rabbit.train_params.dont_freeze_pretrained_doc_encoder:
      dont_update(doc_encoder)
    if rabbit.model_params.use_doc_out:
      doc_encoder = None
  else:
    document_token_embeds = init_embedding(doc_glove_lookup,
                                           document_token_lookup,
                                           num_doc_tokens,
                                           document_token_embed_len)
    if rabbit.model_params.use_single_word_embed_set:
      query_token_embeds_init = document_token_embeds
    else:
      query_token_embeds_init = init_embedding(q_glove_lookup,
                                               query_token_lookup,
                                               num_query_tokens,
                                               query_token_embed_len)
  if not rabbit.train_params.dont_freeze_word_embeds:
    dont_update(document_token_embeds)
    dont_update(query_token_embeds_init)
  else:
    do_update(document_token_embeds)
    do_update(query_token_embeds_init)
  if rabbit.train_params.add_rel_score:
    query_token_embeds, additive = get_additive_regularized_embeds(
        query_token_embeds_init)
    rel_score = RelScore(query_token_embeds, document_token_embeds,
                         rabbit.model_params, rabbit.train_params)
  else:
    query_token_embeds = query_token_embeds_init
    additive = None
    rel_score = None
  eval_query_lookup = get_robust_eval_queries()
  eval_query_name_document_title_rels = get_robust_rels()
  test_query_names = []
  val_query_names = []
  for query_name in eval_query_lookup:
    if len(val_query_names) >= 50:
      test_query_names.append(query_name)
    else:
      val_query_names.append(query_name)
  test_query_name_document_title_rels = _.pick(
      eval_query_name_document_title_rels, test_query_names)
  test_query_lookup = _.pick(eval_query_lookup, test_query_names)
  test_query_name_to_id = create_id_lookup(test_query_lookup.keys())
  test_queries, __ = prepare(test_query_lookup,
                             test_query_name_to_id,
                             token_lookup=query_token_lookup)
  eval_ranking_candidates = read_query_test_rankings(
      './indri/query_result_test' +
      RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set])
  test_candidates_data = read_query_result(
      test_query_name_to_id,
      document_title_to_id,
      dict(zip(range(len(test_queries)), test_queries)),
      path='./indri/query_result_test' +
      RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set])
  test_ranking_candidates = process_raw_candidates(test_query_name_to_id,
                                                   test_queries,
                                                   document_title_to_id,
                                                   test_query_names,
                                                   eval_ranking_candidates)
  test_data = process_rels(test_query_name_document_title_rels,
                           document_title_to_id, test_query_name_to_id,
                           test_queries)
  val_query_name_document_title_rels = _.pick(
      eval_query_name_document_title_rels, val_query_names)
  val_query_lookup = _.pick(eval_query_lookup, val_query_names)
  val_query_name_to_id = create_id_lookup(val_query_lookup.keys())
  val_queries, __ = prepare(val_query_lookup,
                            val_query_name_to_id,
                            token_lookup=query_token_lookup)
  val_candidates_data = read_query_result(
      val_query_name_to_id,
      document_title_to_id,
      dict(zip(range(len(val_queries)), val_queries)),
      path='./indri/query_result_test' +
      RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set])
  val_ranking_candidates = process_raw_candidates(val_query_name_to_id,
                                                  val_queries,
                                                  document_title_to_id,
                                                  val_query_names,
                                                  eval_ranking_candidates)
  val_data = process_rels(val_query_name_document_title_rels,
                          document_title_to_id, val_query_name_to_id,
                          val_queries)
  train_normalized_score_lookup = read_cache(
      name('./train_normalized_score_lookup.pkl', names),
      lambda: get_normalized_score_lookup(train_data))
  test_normalized_score_lookup = get_normalized_score_lookup(
      test_candidates_data)
  val_normalized_score_lookup = get_normalized_score_lookup(
      val_candidates_data)
  if use_pointwise_loss:
    normalized_train_data = read_cache(
        name('./normalized_train_query_data_qrels_and_106756.json', names),
        lambda: normalize_scores_query_wise(train_data))
    collate_fn = lambda samples: collate_query_samples(
        samples,
        use_bow_model=use_bow_model,
        use_dense=rabbit.model_params.use_dense)
    train_dl = build_query_dataloader(
        documents,
        normalized_train_data,
        rabbit.train_params,
        rabbit.model_params,
        cache=name('train_ranking_qrels_and_106756.json', names),
        limit=10,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        normalized_score_lookup=train_normalized_score_lookup,
        use_bow_model=use_bow_model,
        collate_fn=collate_fn,
        is_test=False)
    test_dl = build_query_dataloader(
        documents,
        test_data,
        rabbit.train_params,
        rabbit.model_params,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        normalized_score_lookup=test_normalized_score_lookup,
        use_bow_model=use_bow_model,
        collate_fn=collate_fn,
        is_test=True)
    val_dl = build_query_dataloader(
        documents,
        val_data,
        rabbit.train_params,
        rabbit.model_params,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        normalized_score_lookup=val_normalized_score_lookup,
        use_bow_model=use_bow_model,
        collate_fn=collate_fn,
        is_test=True)
    model = PointwiseScorer(query_token_embeds, document_token_embeds,
                            doc_encoder, rabbit.model_params,
                            rabbit.train_params)
  else:
    if rabbit.train_params.use_noise_aware_loss:
      ranker_query_str_to_rankings = get_ranker_query_str_to_rankings(
          train_query_name_to_id,
          document_title_to_id,
          train_queries,
          limit=rabbit.train_params.num_snorkel_train_queries)
      query_names = reduce(
          lambda acc, query_to_ranking: acc.intersection(
              set(query_to_ranking.keys()))
          if len(acc) != 0 else set(query_to_ranking.keys()),
          ranker_query_str_to_rankings.values(), set())
      all_ranked_lists_by_ranker = _.map_values(
          ranker_query_str_to_rankings,
          lambda query_to_ranking: [query_to_ranking[query]
                                    for query in query_names])
      ranker_query_str_to_pairwise_bins = get_ranker_query_str_to_pairwise_bins(
          train_query_name_to_id,
          document_title_to_id,
          train_queries,
          limit=rabbit.train_params.num_train_queries)
      snorkeller = Snorkeller(ranker_query_str_to_pairwise_bins)
      snorkeller.train(all_ranked_lists_by_ranker)
      calc_marginals = snorkeller.calc_marginals
    else:
      calc_marginals = None
    collate_fn = lambda samples: collate_query_pairwise_samples(
        samples,
        use_bow_model=use_bow_model,
        calc_marginals=calc_marginals,
        use_dense=rabbit.model_params.use_dense)
    if rabbit.run_params.load_influences:
      try:
        with open(rabbit.run_params.influences_path) as fh:
          pairs_to_flip = defaultdict(set)
          for pair, influence in json.load(fh):
            if rabbit.train_params.use_pointwise_loss:
              condition = True
            else:
              condition = influence < rabbit.train_params.influence_thresh
            if condition:
              query = tuple(pair[1])
              pairs_to_flip[query].add(tuple(pair[0]))
      except FileNotFoundError:
        pairs_to_flip = None
    else:
      pairs_to_flip = None
    train_dl = build_query_pairwise_dataloader(
        documents,
        train_data,
        rabbit.train_params,
        rabbit.model_params,
        pairs_to_flip=pairs_to_flip,
        cache=name('train_ranking_qrels_and_106756.json', names),
        limit=10,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        normalized_score_lookup=train_normalized_score_lookup,
        use_bow_model=use_bow_model,
        collate_fn=collate_fn,
        is_test=False)
    test_dl = build_query_pairwise_dataloader(
        documents,
        test_data,
        rabbit.train_params,
        rabbit.model_params,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        normalized_score_lookup=test_normalized_score_lookup,
        use_bow_model=use_bow_model,
        collate_fn=collate_fn,
        is_test=True)
    val_dl = build_query_pairwise_dataloader(
        documents,
        val_data,
        rabbit.train_params,
        rabbit.model_params,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        normalized_score_lookup=val_normalized_score_lookup,
        use_bow_model=use_bow_model,
        collate_fn=collate_fn,
        is_test=True)
    val_rel_dl = build_query_pairwise_dataloader(
        documents,
        val_data,
        rabbit.train_params,
        rabbit.model_params,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        normalized_score_lookup=val_normalized_score_lookup,
        use_bow_model=use_bow_model,
        collate_fn=collate_fn,
        is_test=True,
        rel_vs_irrel=True,
        candidates=val_ranking_candidates,
        num_to_rank=rabbit.run_params.num_to_rank)
    model = PairwiseScorer(query_token_embeds,
                           document_token_embeds,
                           doc_encoder,
                           rabbit.model_params,
                           rabbit.train_params,
                           use_bow_model=use_bow_model)
  train_ranking_dataset = RankingDataset(
      documents,
      train_dl.dataset.rankings,
      rabbit.train_params,
      rabbit.model_params,
      rabbit.run_params,
      query_tok_to_doc_tok=query_tok_to_doc_tok,
      normalized_score_lookup=train_normalized_score_lookup,
      use_bow_model=use_bow_model,
      use_dense=rabbit.model_params.use_dense)
  test_ranking_dataset = RankingDataset(
      documents,
      test_ranking_candidates,
      rabbit.train_params,
      rabbit.model_params,
      rabbit.run_params,
      relevant=test_dl.dataset.rankings,
      query_tok_to_doc_tok=query_tok_to_doc_tok,
      cheat=rabbit.run_params.cheat,
      normalized_score_lookup=test_normalized_score_lookup,
      use_bow_model=use_bow_model,
      use_dense=rabbit.model_params.use_dense)
  val_ranking_dataset = RankingDataset(
      documents,
      val_ranking_candidates,
      rabbit.train_params,
      rabbit.model_params,
      rabbit.run_params,
      relevant=val_dl.dataset.rankings,
      query_tok_to_doc_tok=query_tok_to_doc_tok,
      cheat=rabbit.run_params.cheat,
      normalized_score_lookup=val_normalized_score_lookup,
      use_bow_model=use_bow_model,
      use_dense=rabbit.model_params.use_dense)
  if rabbit.train_params.memorize_test:
    train_dl = test_dl
    train_ranking_dataset = test_ranking_dataset
  model_data = DataBunch(train_dl,
                         val_rel_dl,
                         test_dl,
                         collate_fn=collate_fn,
                         device=torch.device('cuda')
                         if torch.cuda.is_available() else torch.device('cpu'))
  multi_objective_model = MultiObjective(model, rabbit.train_params, rel_score,
                                         additive)
  model_to_save = multi_objective_model
  if rabbit.train_params.memorize_test:
    try:
      del train_data
    except:
      pass
  if not rabbit.run_params.just_caches:
    del document_lookup
    del train_query_lookup
    del query_token_lookup
    del document_token_lookup
    del train_queries
  try:
    del glove_lookup
  except UnboundLocalError:
    del q_glove_lookup
    del doc_glove_lookup
  if rabbit.run_params.load_model:
    try:
      multi_objective_model.load_state_dict(
          torch.load(rabbit.run_params.load_path))
    except RuntimeError:
      dp = nn.DataParallel(multi_objective_model)
      dp.load_state_dict(torch.load(rabbit.run_params.load_path))
      multi_objective_model = dp.module
  else:
    train_model(multi_objective_model, model_data, train_ranking_dataset,
                val_ranking_dataset, test_ranking_dataset, rabbit.train_params,
                rabbit.model_params, rabbit.run_params, experiment)
  if rabbit.train_params.fine_tune_on_val:
    fine_tune_model_data = DataBunch(
        val_rel_dl,
        val_rel_dl,
        test_dl,
        collate_fn=collate_fn,
        device=torch.device('cuda')
        if torch.cuda.is_available() else torch.device('cpu'))
    train_model(multi_objective_model,
                fine_tune_model_data,
                val_ranking_dataset,
                val_ranking_dataset,
                test_ranking_dataset,
                rabbit.train_params,
                rabbit.model_params,
                rabbit.run_params,
                experiment,
                load_path=rabbit.run_params.load_path)
  multi_objective_model.eval()
  device = model_data.device
  gpu_multi_objective_model = multi_objective_model.to(device)
  if rabbit.run_params.calc_influence:
    if rabbit.run_params.freeze_all_but_last_for_influence:
      last_layer_idx = _.find_last_index(
          multi_objective_model.model.pointwise_scorer.layers,
          lambda layer: isinstance(layer, nn.Linear))
      to_last_layer = lambda x: gpu_multi_objective_model(
          *x, to_idx=last_layer_idx)
      last_layer = gpu_multi_objective_model.model.pointwise_scorer.layers[
          last_layer_idx]
      diff_wrt = [p for p in last_layer.parameters() if p.requires_grad]
    else:
      diff_wrt = None
    test_hvps = calc_test_hvps(
        multi_objective_model.loss,
        gpu_multi_objective_model,
        DeviceDataLoader(train_dl, device, collate_fn=collate_fn),
        val_rel_dl,
        rabbit.run_params,
        diff_wrt=diff_wrt,
        show_progress=True,
        use_softrank_influence=rabbit.run_params.use_softrank_influence)
    influences = []
    if rabbit.train_params.use_pointwise_loss:
      num_real_samples = len(train_dl.dataset)
    else:
      num_real_samples = train_dl.dataset._num_pos_pairs
    if rabbit.run_params.freeze_all_but_last_for_influence:
      _sampler = SequentialSamplerWithLimit(train_dl.dataset, num_real_samples)
      _batch_sampler = BatchSampler(_sampler, rabbit.train_params.batch_size,
                                    False)
      _dl = DataLoader(train_dl.dataset,
                       batch_sampler=_batch_sampler,
                       collate_fn=collate_fn)
      sequential_train_dl = DeviceDataLoader(_dl, device,
                                             collate_fn=collate_fn)
      influences = calc_dataset_influence(gpu_multi_objective_model,
                                          to_last_layer,
                                          sequential_train_dl,
                                          test_hvps,
                                          sum_p=True).tolist()
    else:
      for i in progressbar(range(num_real_samples)):
        train_sample = train_dl.dataset[i]
        x, labels = to_device(collate_fn([train_sample]), device)
        device_train_sample = (x, labels.squeeze())
        influences.append(
            calc_influence(multi_objective_model.loss,
                           gpu_multi_objective_model,
                           device_train_sample,
                           test_hvps,
                           diff_wrt=diff_wrt).sum().tolist())
    with open(rabbit.run_params.influences_path, 'w+') as fh:
      json.dump([[train_dl.dataset[idx][1], influence]
                 for idx, influence in enumerate(influences)], fh)
import pandas as pd
import numpy as np
import os
import pickle
from pathlib import Path

from sklearn.utils import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from fastai.text import Tokenizer

from eda import EDA

tok = Tokenizer()


def extract_features(df, out_path, max_features=30000, vectorizer=None):
    reviews = df['reviewText'].tolist()
    tok_reviews = tok.process_all(reviews)
    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=max_features,
                                     preprocessor=lambda x: x,
                                     tokenizer=lambda x: x,
                                     min_df=5)
        X = vectorizer.fit_transform(tok_reviews)
    else:
        X = vectorizer.transform(tok_reviews)
    featset = Bunch(X=X, y=df.overall.values)
    pickle.dump(featset, open(out_path, 'wb'))
    return vectorizer