def query():
    """Print correlation diagnostics between learned query-term weights and
    corpus statistics (tf, df, idf) plus GloVe embedding norms.

    Side effects only: reads `weights_from_query.pkl` and
    `parsed_robust_queries.pkl` from the working directory and prints the
    top/bottom 30 weighted tokens and a series of Pearson correlations.
    """
    # Use `with` so the pickle file handles are closed promptly (the
    # original leaked the handles returned by bare `open`).
    with open('weights_from_query.pkl', 'rb') as fh:
        w = pickle.load(fh).squeeze()
    topk_vals, topk_idxs = torch.topk(w, 30)
    bottomk_vals, bottomk_idxs = torch.topk(-w, 30)
    with open('parsed_robust_queries.pkl', 'rb') as fh:
        docs, lookup = pickle.load(fh)
    tf, df, idf = count_me(docs)
    inv_lookup = _.invert(lookup)
    print('Top30: ', [inv_lookup[idx] for idx in topk_idxs.tolist()])
    print('Bottom30: ', [inv_lookup[idx] for idx in bottomk_idxs.tolist()])
    glove = get_glove_lookup()
    # NOTE(review): every glove token absent from `lookup` collapses onto
    # the '<unk>' index, so later such keys overwrite earlier ones in
    # `glove_by_idx` — confirm this is intended.
    glove_by_idx = _.map_keys(
        glove,
        lambda vec, token: lookup[token] if token in lookup else lookup['<unk>'])
    norms_by_idx = _.map_values(glove_by_idx, torch.norm)
    idxs_in_order = list(norms_by_idx.keys())
    # assumes every idx below appears in tf/df/idf — TODO confirm,
    # otherwise these lookups raise KeyError
    idfs_in_order = torch.tensor([idf[idx] for idx in idxs_in_order])
    dfs_in_order = torch.tensor([df[idx] for idx in idxs_in_order])
    tfs_in_order = torch.tensor([tf[idx] for idx in idxs_in_order])
    norms_in_order = torch.tensor([norms_by_idx[idx] for idx in idxs_in_order])
    w_subset = w[torch.tensor(idxs_in_order)]
    # Correlate weights with raw and log-scaled statistics.
    print(np.corrcoef(w_subset, tfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, dfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, idfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, norms_in_order)[0, 1])
    print(np.corrcoef(w_subset, np.log(tfs_in_order + 1))[0, 1])
    print(np.corrcoef(w_subset, np.log(dfs_in_order))[0, 1])
    print(np.corrcoef(w_subset, np.log(idfs_in_order))[0, 1])
    print(np.corrcoef(w_subset, np.log(norms_in_order + 1))[0, 1])
def check_fleet_config(
        fleet: Any,
        is_setup_stage=False) -> Tuple[bool, Optional[Dict[int, int]]]:
    """
    Validates the fleet configuration (1 four-decker, 2 three-deckers, etc.)

    :param fleet: list of ships
    :param is_setup_stage: If True, disables the check for missing ships
        (the board is assumed to be mid-setup and the player has not yet
        placed all the ships)
    :return: (bool, [dict])
        If the fleet is assembled correctly, returns True, None
        If there are extra ships, returns False, None
        If ships are missing, returns True or False (depending on
        is_setup_stage) and the dict of missing slots
    """
    lengths = map(len, fleet)
    config = Counter(lengths)
    if config == SHIP_CONFIG:
        return True, None
    # Checking for extra ships
    configs = group_by_keys((config, SHIP_CONFIG), 0)
    # positive diff = ships of that length missing; negative = extras present
    diff = py_.map_values(configs, lambda counts: counts[1] - counts[0])
    extra_ships = any(py_.map_(list(diff.values()), lambda x: x < 0))
    if extra_ships:
        return False, None
    missing_ships = {k: v for k, v in diff.items() if v > 0}
    if missing_ships:
        return is_setup_stage, missing_ships
def get_query_str_to_pairwise_bins(query_name_to_id, document_title_to_id,
                                   queries, path, limit=None):
    """Parse a run file (6 space-separated columns per line) into pairwise
    bins keyed by query-token string.

    :param query_name_to_id: query name -> query id
    :param document_title_to_id: document title -> document id
    :param queries: lookup from query id (int or its str form) to tokens
    :param path: path of the run file to read
    :param limit: if given, stop after this many distinct queries collected
    :return: defaultdict mapping ``str(tokens)[1:-1]`` (brackets stripped)
        to ``get_pairwise_bins`` over that query's doc ids; missing keys
        default to ``(set(), set())``
    """
    pairwise_bins_by_query = defaultdict(list)
    with open(path) as fh:
        while True:
            if limit is not None and len(pairwise_bins_by_query) >= limit:
                break
            line = fh.readline()
            if line:
                # only columns 1 (query name) and 3 (doc title) are used
                query_name, __, doc_title, __, __, ___ = line.strip().split(
                    ' ')
                if query_name not in query_name_to_id:
                    continue
                if doc_title not in document_title_to_id:
                    continue
                query_id = query_name_to_id[query_name]
                # `queries` may be keyed by int or str ids — try both
                if query_id not in queries:
                    query_id = str(query_id)
                    if query_id not in queries:
                        continue
                pairwise_bins_by_query[str(queries[query_id])[1:-1]].append(
                    document_title_to_id[doc_title])
            else:
                # EOF
                break
    result = _.map_values(dict(pairwise_bins_by_query), get_pairwise_bins)
    return defaultdict(lambda: (set(), set()), result)
def _drop_overlapping_mentions(link_contexts):
    """Drop mentions that overlap previously kept mentions.

    :param link_contexts: dict mapping entity -> list of mentions
    :return: reduced entity -> mentions mapping produced by folding all
        [entity, mention] pairs through `_drop_overlapping_mentions_reducer`
    """
    # Flatten with a comprehension; the original `sum(..., [])` rebuilt the
    # accumulator list on every step (O(n^2) in the number of pairs).
    entity_mention_pairs = [[entity, mention]
                            for entity, mentions in link_contexts.items()
                            for mention in mentions]
    __, reduced_link_contexts = reduce(_drop_overlapping_mentions_reducer,
                                       entity_mention_pairs, ([], {}))
    return reduced_link_contexts
def problem2():
    """Find the (guard, minute) entry with the highest sleep count and
    print guard_id * minute."""
    parsed = parse(read_input())
    per_guard_maps = pydash.map_values(parsed, sleep_map)
    # sort ascending by count (index 2); sorted is stable, so the LAST
    # element among ties is taken, matching the original behavior
    by_count = sorted(flatten_minutes(per_guard_maps), key=lambda s: s[2])
    best = by_count[-1]
    print(best[0] * best[1])
def main():
    """Build the entity-candidates prior from MySQL and dump it to
    lookups.pkl.

    The prior maps each surface string (mention text, canonical entity
    name, pre-redirect title) to {entity label: co-occurrence count}, where
    labels are dense ints assigned in first-seen order. train_size is
    recorded as 1.0 (no held-out split in this variant).
    """
    load_dotenv(dotenv_path='.env')
    EL_DATABASE_NAME = os.getenv("EL_DBNAME")
    DATABASE_USER = os.getenv("DBUSER")
    DATABASE_PASSWORD = os.getenv("DBPASS")
    DATABASE_HOST = os.getenv("DBHOST")
    # SSDictCursor streams rows from the server instead of buffering all
    connection = pymysql.connect(host=DATABASE_HOST,
                                 user=DATABASE_USER,
                                 password=DATABASE_PASSWORD,
                                 db=EL_DATABASE_NAME,
                                 charset='utf8mb4',
                                 use_unicode=True,
                                 cursorclass=pymysql.cursors.SSDictCursor)
    try:
        with connection.cursor() as cursor:
            cursor.execute("SET NAMES utf8mb4;")
            cursor.execute("SET CHARACTER SET utf8mb4;")
            cursor.execute("SET character_set_connection=utf8mb4;")
            cursor.execute(
                'select mention, entity_id from entity_mentions_text')
            # surface string -> {entity label: count}
            candidates_prior = defaultdict(lambda: defaultdict(int))
            # entity_id -> dense label, assigned in first-seen order
            entity_labels = {}
            for row in progressbar(cursor):
                if row['entity_id'] not in entity_labels:
                    entity_labels[row['entity_id']] = len(entity_labels)
                entity_label = entity_labels[row['entity_id']]
                candidates_prior[row['mention']][entity_label] += 1
            # count each entity's own canonical name as a mention of it
            cursor.execute(
                'select distinct entity_id, entity from entity_mentions_text')
            for row in progressbar(cursor):
                if row['entity_id'] not in entity_labels:
                    entity_labels[row['entity_id']] = len(entity_labels)
                entity_label = entity_labels[row['entity_id']]
                candidates_prior[row['entity']][entity_label] += 1
            # and pre-redirect titles pointing at the entity
            cursor.execute(
                'select distinct preredirect, entity_id from mentions m join entity_mentions em on em.mention_id = m.id'
            )
            for row in progressbar(cursor):
                if row['entity_id'] not in entity_labels:
                    entity_labels[row['entity_id']] = len(entity_labels)
                entity_label = entity_labels[row['entity_id']]
                candidates_prior[row['preredirect']][entity_label] += 1
            with open('lookups.pkl', 'wb') as lookup_file:
                # convert nested defaultdicts to plain dicts so the pickle
                # does not capture the (unpicklable) lambda factory
                pickle.dump(
                    {
                        'lookups': {
                            'entity_candidates_prior':
                            _.map_values(dict(candidates_prior), dict),
                            'entity_labels': entity_labels
                        },
                        'train_size': 1.0
                    }, lookup_file)
    finally:
        connection.close()
def main():
    """Build the entity-candidates prior from MySQL, counting mention
    co-occurrences only for the first 80% of pages in `page_id_order`
    (the training split), and dump it with entity labels to lookups.pkl.

    Requires `page_id_order.pkl` (produced by create_page_id_order.py).
    """
    load_dotenv(dotenv_path='.env')
    EL_DATABASE_NAME = os.getenv("EL_DBNAME")
    DATABASE_USER = os.getenv("DBUSER")
    DATABASE_PASSWORD = os.getenv("DBPASS")
    DATABASE_HOST = os.getenv("DBHOST")
    connection = pymysql.connect(host=DATABASE_HOST,
                                 user=DATABASE_USER,
                                 password=DATABASE_PASSWORD,
                                 db=EL_DATABASE_NAME,
                                 charset='utf8mb4',
                                 use_unicode=True,
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            cursor.execute("SET NAMES utf8mb4;")
            cursor.execute("SET CHARACTER SET utf8mb4;")
            cursor.execute("SET character_set_connection=utf8mb4;")
            cursor.execute(
                'select mention, entity_id, page_id from entity_mentions_text')
            # surface string -> {entity label: count}
            candidates_prior = defaultdict(lambda: defaultdict(int))
            # entity_id -> dense label, assigned in first-seen order
            entity_labels = {}
            train_size = 0.8
            try:
                with open('./page_id_order.pkl', 'rb') as f:
                    page_id_order = pickle.load(f)
            except Exception as e:
                raise type(e)(
                    str(e) + '\n' +
                    'Create `page_id_order.pkl` by running `create_page_id_order.py`'
                ).with_traceback(sys.exc_info()[2])
            num_train_pages = int(len(page_id_order) * train_size)
            # FIX: the original kept this as a list, making the per-row
            # membership test below O(num pages); a set makes it O(1).
            train_page_id_order = set(page_id_order[:num_train_pages])
            for row in cursor.fetchall():
                # labels are assigned for every entity seen, but counts
                # accumulate only for mentions on training pages
                if row['entity_id'] not in entity_labels:
                    entity_labels[row['entity_id']] = len(entity_labels)
                if row['page_id'] not in train_page_id_order:
                    continue
                entity_label = entity_labels[row['entity_id']]
                candidates_prior[row['mention']][entity_label] += 1
            # count each entity's own canonical name as a mention of it
            cursor.execute(
                'select distinct entity_id, entity from entity_mentions_text')
            for row in cursor.fetchall():
                if row['entity_id'] not in entity_labels:
                    entity_labels[row['entity_id']] = len(entity_labels)
                entity_label = entity_labels[row['entity_id']]
                candidates_prior[row['entity']][entity_label] += 1
            # and pre-redirect titles pointing at the entity
            cursor.execute(
                'select distinct preredirect, entity_id from mentions m join entity_mentions em on em.mention_id = m.id'
            )
            for row in cursor.fetchall():
                if row['entity_id'] not in entity_labels:
                    entity_labels[row['entity_id']] = len(entity_labels)
                entity_label = entity_labels[row['entity_id']]
                candidates_prior[row['preredirect']][entity_label] += 1
            with open('lookups.pkl', 'wb') as lookup_file:
                # plain dicts so the pickle doesn't capture the lambda factory
                pickle.dump(
                    {
                        'lookups': {
                            'entity_candidates_prior':
                            _.map_values(dict(candidates_prior), dict),
                            'entity_labels': entity_labels
                        },
                        'train_size': train_size
                    }, lookup_file)
    finally:
        connection.close()
def process_raw_candidates(query_name_to_id, queries, document_title_to_id,
                           query_names, raw_ranking_candidates):
    """Convert raw ranking candidates (doc titles keyed by query name) into
    doc ids keyed by the stringified query tokens.

    Unknown titles map to doc id 0; only queries in `query_names` are kept.
    """
    def _title_to_doc_id(title):
        # unknown/falsy lookups fall back to doc id 0
        return document_title_to_id.get(title) or 0

    def _query_key(ranking, query_name):
        # str(tokens) with the surrounding brackets stripped
        return str(queries[query_name_to_id[query_name]])[1:-1]

    candidates_for_names = _.pick(raw_ranking_candidates, query_names)
    id_candidates = _.map_values(
        candidates_for_names,
        lambda candidate_names: _.map_(candidate_names, _title_to_doc_id))
    return _.map_keys(id_candidates, _query_key)
def call_from_cli(self, command, args, verbose=True):
    """Dispatch a CLI command string to the matching method on this
    component (or the singleton `Component.instance` if set).

    Converts dashes to underscores, resolves `redirect` wrappers, parses
    named arguments, coerces string args via `string_to_any`, and prepends
    `self` for non-static methods. On success the result is printed (when
    `verbose`) or returned; unknown commands or arity mismatches yield an
    {'error': ...} dict instead.
    """
    command = command.replace('-', '_')
    # prefer the singleton instance when one has been registered
    if Component.instance:
        self = Component.instance
    if hasattr(self, command):
        command = getattr(self, command)
        method_container = command
        # a `redirect` attribute points at the callable whose signature
        # should be inspected instead
        if hasattr(command, 'redirect'):
            method_container = command.redirect
        params = inspect.signature(method_container).parameters.values()
        # plain function whose first param isn't `self` => static-style call
        is_static = isinstance(
            method_container,
            types.FunctionType) and list(params)[0].name != 'self'
        if has_named_args(args):
            named = named_args_as_positional(args, params, self.name,
                                             method_container.__name__)
            args = named['args']
            # named properties are coerced and applied via init()
            named['properties'] = _.map_values(named['properties'],
                                               string_to_any)
            self.init(named['properties'])
        last_param = list(params)[-1]
        # *args-style parameter accepts any number of trailing arguments
        is_consuming_rest = last_param.kind == last_param.VAR_POSITIONAL
        if not is_consuming_rest and len(args) > len(params):
            result = {
                'error':
                f'Wrong number of arguments passed to {command.__name__}: expected {len(params)} instead of {len(args)}.'
            }
        else:
            args = list(map(string_to_any, args))
            if not is_static:
                args.insert(0, self)
            self.called_from_cli = True
            command.as_cli = True
            result = command(*args)
    else:
        result = {'error': f'{self.name} has no method {command}.'}
    # NOTE(review): source formatting was ambiguous here — this reading
    # prints when verbose and returns the result otherwise; confirm the
    # `else` was meant to pair with `if verbose` rather than `if result`.
    if verbose:
        if result:
            print(respond(result))
    else:
        return result
def merge_mentions(processed_pages):
    '''merge the link contexts from a list of pages'''
    def _concat(dest, src):
        # append lists when both sides exist, otherwise take the new value
        return dest + src if dest else src

    link_contexts = {}
    entity_counts = {}
    for processed_page in processed_pages:
        link_contexts = _.merge_with(link_contexts,
                                     processed_page['link_contexts'],
                                     iteratee=_concat)
        entity_counts = _.merge_with(entity_counts,
                                     processed_page['entity_counts'],
                                     iteratee=_concat)
    return _.map_values(
        link_contexts, lambda val, key: {
            'link_contexts': val,
            'entity_counts': entity_counts[key]
        })
def process_page(redirects_lookup, page, is_seed_page=False):
    """Clean a raw wiki page and extract its link contexts.

    Returns a dict with 'document_info' (page metadata), 'link_contexts'
    (entity -> mentions), and 'entity_counts' (mentions per entity).
    """
    cleaned = clean_page(page)
    contexts = get_link_contexts_using_heuristics(redirects_lookup, cleaned)
    info = {
        'source_id': cleaned['pageID'],
        'title': cleaned['title'],
        'text': cleaned['plaintext'],
        'categories': cleaned['categories'],
        'is_disambiguation_page': cleaned['isDisambiguation'],
        'is_seed_page': is_seed_page,
    }
    return {
        'document_info': info,
        'link_contexts': contexts,
        'entity_counts': _.map_values(contexts, len),
    }
def test_process_page():
    """process_page should surface title/text/categories from the raw page
    and reproduce the fixture link contexts with per-entity counts."""
    with open('test/fixtures/parade_page.json') as f:
        parade_page = json.load(f)
    with open('test/fixtures/parade_page_contexts.json') as f:
        parade_page_contexts = json.load(f)
    redirects_lookup = {}  # no redirects involved in this case
    processed_page = pp.process_page(redirects_lookup, parade_page)
    assert processed_page['document_info']['title'] == parade_page['title']
    assert processed_page['document_info']['text'] == parade_page['plaintext']
    assert processed_page['document_info']['categories'] == parade_page[
        'categories']
    assert processed_page['link_contexts'] == parade_page_contexts
    # entity_counts is the number of mentions per linked entity
    assert processed_page['entity_counts'] == _.map_values(
        parade_page_contexts, len)
def test_get_page_iobes():
    """get_page_iobes should produce the expected IOBES tagging for the
    parade fixture given its mentions sorted by offset."""
    with open('test/fixtures/parade_page_db.json') as f:
        parade_page = json.load(f)
    with open('test/fixtures/parade_page_contexts.json') as f:
        # drop fixture mentions whose offset lies beyond the page content
        filter_out_of_bounds = lambda mention: mention['offset'] < len(
            parade_page['content'])
        parade_page_contexts = _.map_values(
            json.load(f),
            lambda mentions: list(filter(filter_out_of_bounds, mentions)))
    # flatten to [entity title, mention] pairs, then sort by mention offset
    context_pairs = _.mapcat(
        _.to_pairs(parade_page_contexts),
        lambda pair: [[pair[0], mention] for mention in pair[1]])
    contexts = _.sort_by(context_pairs,
                         lambda title_mention: title_mention[1]['offset'])
    mentions = _.flat_map(contexts, _.last)
    mention_link_titles = list(map(_.head, contexts))
    assert parade_iobes == iobes.get_page_iobes(parade_page, mentions,
                                                mention_link_titles)
def __init__(self,
             cursor,
             page_id_order,
             entity_candidates_prior,
             entity_label_lookup,
             embedding,
             token_idx_lookup,
             batch_size,
             num_entities,
             num_candidates,
             cheat=False,
             buffer_scale=1,
             min_mentions=1,
             use_fast_sampler=False,
             use_wiki2vec=False,
             start_from_page_num=0):
    """Dataset over entity mentions read page by page from a DB cursor.

    :param cursor: DB cursor used both here (to preload valid entity ids
        when min_mentions > 1) and later while iterating
    :param page_id_order: ordering of page ids to visit
    :param entity_label_lookup: entity id -> dense label; converted below to
        tensors, with the inverse mapping kept in entity_id_lookup
    :param min_mentions: when > 1, restricts to entities with at least this
        many mentions (queried from the `entities` table)
    :param start_from_page_num: resume iteration from this page counter
    """
    self.page_id_order = page_id_order
    self.entity_candidates_prior = entity_candidates_prior
    # entity id -> label tensor; inverse (int label -> entity id) below
    self.entity_label_lookup = _.map_values(entity_label_lookup, torch.tensor)
    self.entity_id_lookup = {
        int(label): entity_id
        for entity_id, label in self.entity_label_lookup.items()
    }
    self.embedding = embedding
    self.token_idx_lookup = token_idx_lookup
    self.cursor = cursor
    self.batch_size = batch_size
    self.num_entities = num_entities
    self.num_candidates = num_candidates
    # per-page caches, filled lazily while iterating
    self._sentence_spans_lookup = {}
    self._page_content_lookup = {}
    self._embedded_page_content_lookup = {}
    self._entity_page_mentions_lookup = {}
    self._mentions_per_page_ctr = {}
    self._mention_infos = {}
    self._candidate_strs_lookup = {}
    self._bag_of_nouns_lookup = {}
    self.page_ctr = start_from_page_num
    self.cheat = cheat
    self.buffer_scale = buffer_scale
    self.min_mentions = min_mentions
    self.use_fast_sampler = use_fast_sampler
    self.use_wiki2vec = use_wiki2vec
    # if self.use_fast_sampler: assert not self.use_wiki2vec, 'train wiki2vec locally'
    if self.min_mentions > 1:
        query = 'select id from entities where num_mentions >= ' + str(
            self.min_mentions)
        cursor.execute(query)
        self.valid_entity_ids = set(row['id'] for row in cursor.fetchall())
def get_votes_by_bill(leg_id_to_pol_id,
                      state_votes_path='data/statehvotes.json'
                      ) -> Dict[str, Dict[str, Set[int]]]:
    """Load state house votes and bucket politician ids into 'for',
    'against', and 'abstain' sets per bill.

    Legislator ids missing from `leg_id_to_pol_id` are skipped; any vote
    string other than 'yes'/'no' counts as an abstention.
    """
    def _bucket_votes(bill_info):
        buckets = {'for': set(), 'against': set(), 'abstain': set()}
        for leg_id, vote in bill_info['votes'].items():
            if leg_id not in leg_id_to_pol_id:
                continue
            pol_id = leg_id_to_pol_id[leg_id]
            if vote == 'yes':
                buckets['for'].add(pol_id)
            elif vote == 'no':
                buckets['against'].add(pol_id)
            else:
                buckets['abstain'].add(pol_id)
        return buckets

    with open(state_votes_path) as fh:
        votes = json.load(fh)
    return {bill: _bucket_votes(info) for bill, info in votes.items()}
def _read_attributes(self):
    '''
    Returns
    -------
    dict
        Current remote value for each attribute name in
        ``self._attributes``. Values whose type is not supported (i.e.,
        not a plain old data type) come back as ``None``.

    See also
    --------
    :meth:`_write_attribute`
    '''
    return {name: self._read_attribute(name, None)
            for name in self._attributes}
def test_process_page_with_redirects():
    """Mentions whose link target is a redirect should be merged under the
    redirect's destination entity."""
    with open('test/fixtures/parade_page.json') as f:
        parade_page = json.load(f)
    with open('test/fixtures/parade_page_contexts.json') as f:
        parade_page_contexts = json.load(f)
    redirects_lookup = {"Fort de Goede Hoop": "Kaapstad"}
    processed_page = pp.process_page(redirects_lookup, parade_page)
    assert processed_page['document_info']['title'] == parade_page['title']
    assert processed_page['document_info']['text'] == parade_page['plaintext']
    assert processed_page['document_info']['categories'] == parade_page[
        'categories']
    # mirror the redirect in the expected fixture: move the redirected
    # mention under its destination entity
    parade_page_contexts["Kaapstad"].insert(
        1, parade_page_contexts.pop("Fort de Goede Hoop")[0])
    assert processed_page['link_contexts'] == parade_page_contexts
    assert processed_page['entity_counts'] == _.map_values(
        parade_page_contexts, len)
def count_me(docs):
    """Compute corpus statistics over tokenized docs (lists of token ids).

    :param docs: iterable of documents, each a list of int token ids
    :return: (tf, df, idf) where
        tf  - total occurrences of each token id across all docs
              (collection frequency, not per-document tf)
        df  - number of docs containing each token id
        idf - 1 / df (reciprocal document frequency)

    Ids 0 and 1 are pre-seeded (tf 0, df 1) — presumably padding/unk
    tokens, keeping their idf defined even when absent from `docs`.
    """
    tf = {0: 0, 1: 0}
    df = {0: 1, 1: 1}
    for doc in docs:
        # dict.get avoids the explicit membership branches
        for idx in doc:
            tf[idx] = tf.get(idx, 0) + 1
        # set(doc) gives the distinct ids for document frequency directly
        for idx in set(doc):
            df[idx] = df.get(idx, 0) + 1
    # stdlib comprehension replaces the pydash map_values call
    idf = {idx: 1 / cnt for idx, cnt in df.items()}
    return tf, df, idf
def nest(collection, *properties):
    """Group `collection` hierarchically, one level per property.

    Like :func:`group_by`, but each additional property in `properties`
    adds one level of nesting. With a single property this is equivalent
    to ``group_by(collection, prop)``; with none, the collection is
    returned unchanged.

    Args:
        collection (list|dict): Collection to iterate over.
        *properties (str): Properties to nest by.

    Returns:
        dict: Results of nested grouping by `properties`.

    Example:
        ``nest(shapes, 'shape', 'qty')`` groups first by ``'shape'`` and
        then, within each shape, by ``'qty'``.

    .. versionadded:: 4.3.0
    """
    if not properties:
        return collection
    flat_props = pyd.flatten(properties)
    outer_key = flat_props[0]
    inner_keys = flat_props[1:]
    grouped = group_by(collection, outer_key)
    # recurse into each group with the remaining properties
    return pyd.map_values(grouped,
                          lambda subgroup: nest(subgroup, *inner_keys))
def test_embed_page_content():
    """embed_page_content should embed the page tokens in order and wrap
    each mention span with MENTION_START_HERE / MENTION_END_HERE marker
    embeddings."""
    # 1-d "embeddings" keyed by token so the expected output is readable
    embedding_dict = _.map_values(
        {
            '<PAD>': [-1],
            '<UNK>': [0],
            'MENTION_START_HERE': [-2],
            'MENTION_END_HERE': [-3],
            'a': [1],
            'b': [2],
            'c': [3],
            'd': [4]
        }, torch.tensor)
    token_idx_lookup = dict(
        zip(embedding_dict.keys(), range(len(embedding_dict))))
    embedding = nn.Embedding.from_pretrained(
        torch.stack([embedding_dict[token] for token in token_idx_lookup]))
    # mention 'b c' starts at character offset 2 of 'a b c d'
    page_mention_infos = [{'offset': 2, 'mention': 'b c'}]
    page_content = 'a b c d'
    # expected: a, <start>, b, c, <end>, d
    embedded = torch.tensor([[1], [-2], [2], [3], [-3], [4]])
    assert torch.equal(
        dt.embed_page_content(embedding, token_idx_lookup, page_content,
                              page_mention_infos), embedded)
def __init__(self,
             cursor,
             page_id_order,
             entity_candidates_prior,
             entity_label_lookup,
             embedding,
             token_idx_lookup,
             batch_size,
             num_entities,
             num_candidates,
             entity_embeds,
             cheat=False,
             buffer_scale=1,
             min_mentions=1,
             use_fast_sampler=False,
             use_wiki2vec=False,
             use_sum_encoder=False,
             start_from_page_num=0,
             ablation=['local_context', 'document_context', 'prior']):
    """Mention-level dataset reading pages from a DB cursor, with cached
    candidate strings, entity embeddings, and per-entity token counts.

    NOTE(review): `ablation` has a mutable (list) default argument — it is
    only assigned here, but the list is shared across instances; consider a
    tuple. Also reads several pickle/cache files from the working
    directory (candidate_strs_lookup.pkl, entity_to_row_id.pkl,
    desc_unstemmed_fs.npz, page_to_entity_id.pkl).
    """
    self._candidate_strs_lookup = read_cache(
        './candidate_strs_lookup.pkl', lambda: get_str_lookup(cursor))
    self.page_id_order = page_id_order
    self.entity_candidates_prior = entity_candidates_prior
    # entity id -> label tensor; inverse (int label -> entity id) below
    self.entity_label_lookup = _.map_values(entity_label_lookup, torch.tensor)
    self.entity_id_lookup = {
        int(label): entity_id
        for entity_id, label in self.entity_label_lookup.items()
    }
    self.embedding = embedding
    self.token_idx_lookup = token_idx_lookup
    self.cursor = cursor
    self.batch_size = batch_size
    self.num_entities = num_entities
    self.num_candidates = num_candidates
    # per-page caches, filled lazily while iterating
    self._sentence_spans_lookup = {}
    self._page_content_lookup = {}
    self._embedded_page_content_lookup = {}
    self._page_token_cnts_lookup = {}
    self._entity_page_mentions_lookup = {}
    self._mentions_per_page_ctr = defaultdict(int)
    self._mention_infos = {}
    self._bag_of_nouns_lookup = {}
    self.page_ctr = start_from_page_num
    self.cheat = cheat
    self.buffer_scale = buffer_scale
    self.min_mentions = min_mentions
    self.use_fast_sampler = use_fast_sampler
    self.use_wiki2vec = use_wiki2vec
    self.use_sum_encoder = use_sum_encoder
    # if self.use_fast_sampler: assert not self.use_wiki2vec, 'train wiki2vec locally'
    self.prior_approx_mapping = u.get_prior_approx_mapping(
        self.entity_candidates_prior)
    # cap on characters of page content considered
    self.page_content_lim = 5000
    if self.min_mentions > 1:
        query = 'select id from entities where num_mentions >= ' + str(
            self.min_mentions)
        cursor.execute(query)
        self.valid_entity_ids = set(row['id'] for row in cursor.fetchall())
    self.ablation = ablation
    self.entity_embeds = entity_embeds
    self._offset = 0
    with open('./entity_to_row_id.pkl', 'rb') as fh:
        entity_id_to_row = pickle.load(fh)
    # sparse doc-frequency matrix keyed by entity row; unknown entities
    # fall back to the default token counter {1: 1}
    self.token_ctr_by_entity_id = DocLookup('./desc_unstemmed_fs.npz',
                                            entity_id_to_row,
                                            default_value={1: 1},
                                            use_default=True)
    self.to_entity_id = read_cache(
        './page_to_entity_id.pkl',
        lambda: get_page_id_to_entity_id_lookup(cursor))
def main(): global model_to_save global experiment global rabbit rabbit = MyRabbit(args) if rabbit.model_params.dont_limit_num_uniq_tokens: raise NotImplementedError() if rabbit.model_params.frame_as_qa: raise NotImplementedError if rabbit.run_params.drop_val_loss_calc: raise NotImplementedError if rabbit.run_params.use_softrank_influence and not rabbit.run_params.freeze_all_but_last_for_influence: raise NotImplementedError if rabbit.train_params.weight_influence: raise NotImplementedError experiment = Experiment(rabbit.train_params + rabbit.model_params + rabbit.run_params) print('Model name:', experiment.model_name) use_pretrained_doc_encoder = rabbit.model_params.use_pretrained_doc_encoder use_pointwise_loss = rabbit.train_params.use_pointwise_loss query_token_embed_len = rabbit.model_params.query_token_embed_len document_token_embed_len = rabbit.model_params.document_token_embed_len _names = [] if not rabbit.model_params.dont_include_titles: _names.append('with_titles') if rabbit.train_params.num_doc_tokens_to_consider != -1: _names.append('num_doc_toks_' + str(rabbit.train_params.num_doc_tokens_to_consider)) if not rabbit.run_params.just_caches: if rabbit.model_params.dont_include_titles: document_lookup = read_cache(name('./doc_lookup.json', _names), get_robust_documents) else: document_lookup = read_cache(name('./doc_lookup.json', _names), get_robust_documents_with_titles) num_doc_tokens_to_consider = rabbit.train_params.num_doc_tokens_to_consider document_title_to_id = read_cache( './document_title_to_id.json', lambda: create_id_lookup(document_lookup.keys())) with open('./caches/106756_most_common_doc.json', 'r') as fh: doc_token_set = set(json.load(fh)) tokenizer = Tokenizer() tokenized = set( sum( tokenizer.process_all(list( get_robust_eval_queries().values())), [])) doc_token_set = doc_token_set.union(tokenized) use_bow_model = not any([ rabbit.model_params[attr] for attr in ['use_doc_out', 'use_cnn', 'use_lstm', 'use_pretrained_doc_encoder'] ]) 
use_bow_model = use_bow_model and not rabbit.model_params.dont_use_bow if use_bow_model: documents, document_token_lookup = read_cache( name(f'./docs_fs_tokens_limit_uniq_toks_qrels_and_106756.pkl', _names), lambda: prepare_fs(document_lookup, document_title_to_id, num_tokens=num_doc_tokens_to_consider, token_set=doc_token_set)) if rabbit.model_params.keep_top_uniq_terms is not None: documents = [ dict( nlargest(rabbit.model_params.keep_top_uniq_terms, _.to_pairs(doc), itemgetter(1))) for doc in documents ] else: documents, document_token_lookup = read_cache( name( f'./parsed_docs_{num_doc_tokens_to_consider}_tokens_limit_uniq_toks_qrels_and_106756.json', _names), lambda: prepare(document_lookup, document_title_to_id, num_tokens=num_doc_tokens_to_consider, token_set=doc_token_set)) if not rabbit.run_params.just_caches: train_query_lookup = read_cache('./robust_train_queries.json', get_robust_train_queries) train_query_name_to_id = read_cache( './train_query_name_to_id.json', lambda: create_id_lookup(train_query_lookup.keys())) train_queries, query_token_lookup = read_cache( './parsed_robust_queries_dict.json', lambda: prepare(train_query_lookup, train_query_name_to_id, token_lookup=document_token_lookup, token_set=doc_token_set, drop_if_any_unk=True)) query_tok_to_doc_tok = { idx: document_token_lookup.get(query_token) or document_token_lookup['<unk>'] for query_token, idx in query_token_lookup.items() } names = [RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set]] if rabbit.train_params.use_pointwise_loss or not rabbit.run_params.just_caches: train_data = read_cache( name('./robust_train_query_results_tokens_qrels_and_106756.json', names), lambda: read_query_result( train_query_name_to_id, document_title_to_id, train_queries, path='./indri/query_result' + RANKER_NAME_TO_SUFFIX[ rabbit.train_params.ranking_set])) else: train_data = [] q_embed_len = rabbit.model_params.query_token_embed_len doc_embed_len = rabbit.model_params.document_token_embed_len if 
rabbit.model_params.append_difference or rabbit.model_params.append_hadamard: assert q_embed_len == doc_embed_len, 'Must use same size doc and query embeds when appending diff or hadamard' if q_embed_len == doc_embed_len: glove_lookup = get_glove_lookup( embedding_dim=q_embed_len, use_large_embed=rabbit.model_params.use_large_embed, use_word2vec=rabbit.model_params.use_word2vec) q_glove_lookup = glove_lookup doc_glove_lookup = glove_lookup else: q_glove_lookup = get_glove_lookup( embedding_dim=q_embed_len, use_large_embed=rabbit.model_params.use_large_embed, use_word2vec=rabbit.model_params.use_word2vec) doc_glove_lookup = get_glove_lookup( embedding_dim=doc_embed_len, use_large_embed=rabbit.model_params.use_large_embed, use_word2vec=rabbit.model_params.use_word2vec) num_query_tokens = len(query_token_lookup) num_doc_tokens = len(document_token_lookup) doc_encoder = None if use_pretrained_doc_encoder or rabbit.model_params.use_doc_out: doc_encoder, document_token_embeds = get_doc_encoder_and_embeddings( document_token_lookup, rabbit.model_params.only_use_last_out) if rabbit.model_params.use_glove: query_token_embeds_init = init_embedding(q_glove_lookup, query_token_lookup, num_query_tokens, query_token_embed_len) else: query_token_embeds_init = from_doc_to_query_embeds( document_token_embeds, document_token_lookup, query_token_lookup) if not rabbit.train_params.dont_freeze_pretrained_doc_encoder: dont_update(doc_encoder) if rabbit.model_params.use_doc_out: doc_encoder = None else: document_token_embeds = init_embedding(doc_glove_lookup, document_token_lookup, num_doc_tokens, document_token_embed_len) if rabbit.model_params.use_single_word_embed_set: query_token_embeds_init = document_token_embeds else: query_token_embeds_init = init_embedding(q_glove_lookup, query_token_lookup, num_query_tokens, query_token_embed_len) if not rabbit.train_params.dont_freeze_word_embeds: dont_update(document_token_embeds) dont_update(query_token_embeds_init) else: 
do_update(document_token_embeds) do_update(query_token_embeds_init) if rabbit.train_params.add_rel_score: query_token_embeds, additive = get_additive_regularized_embeds( query_token_embeds_init) rel_score = RelScore(query_token_embeds, document_token_embeds, rabbit.model_params, rabbit.train_params) else: query_token_embeds = query_token_embeds_init additive = None rel_score = None eval_query_lookup = get_robust_eval_queries() eval_query_name_document_title_rels = get_robust_rels() test_query_names = [] val_query_names = [] for query_name in eval_query_lookup: if len(val_query_names) >= 50: test_query_names.append(query_name) else: val_query_names.append(query_name) test_query_name_document_title_rels = _.pick( eval_query_name_document_title_rels, test_query_names) test_query_lookup = _.pick(eval_query_lookup, test_query_names) test_query_name_to_id = create_id_lookup(test_query_lookup.keys()) test_queries, __ = prepare(test_query_lookup, test_query_name_to_id, token_lookup=query_token_lookup) eval_ranking_candidates = read_query_test_rankings( './indri/query_result_test' + RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set]) test_candidates_data = read_query_result( test_query_name_to_id, document_title_to_id, dict(zip(range(len(test_queries)), test_queries)), path='./indri/query_result_test' + RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set]) test_ranking_candidates = process_raw_candidates(test_query_name_to_id, test_queries, document_title_to_id, test_query_names, eval_ranking_candidates) test_data = process_rels(test_query_name_document_title_rels, document_title_to_id, test_query_name_to_id, test_queries) val_query_name_document_title_rels = _.pick( eval_query_name_document_title_rels, val_query_names) val_query_lookup = _.pick(eval_query_lookup, val_query_names) val_query_name_to_id = create_id_lookup(val_query_lookup.keys()) val_queries, __ = prepare(val_query_lookup, val_query_name_to_id, token_lookup=query_token_lookup) val_candidates_data = 
read_query_result( val_query_name_to_id, document_title_to_id, dict(zip(range(len(val_queries)), val_queries)), path='./indri/query_result_test' + RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set]) val_ranking_candidates = process_raw_candidates(val_query_name_to_id, val_queries, document_title_to_id, val_query_names, eval_ranking_candidates) val_data = process_rels(val_query_name_document_title_rels, document_title_to_id, val_query_name_to_id, val_queries) train_normalized_score_lookup = read_cache( name('./train_normalized_score_lookup.pkl', names), lambda: get_normalized_score_lookup(train_data)) test_normalized_score_lookup = get_normalized_score_lookup( test_candidates_data) val_normalized_score_lookup = get_normalized_score_lookup( val_candidates_data) if use_pointwise_loss: normalized_train_data = read_cache( name('./normalized_train_query_data_qrels_and_106756.json', names), lambda: normalize_scores_query_wise(train_data)) collate_fn = lambda samples: collate_query_samples( samples, use_bow_model=use_bow_model, use_dense=rabbit.model_params.use_dense) train_dl = build_query_dataloader( documents, normalized_train_data, rabbit.train_params, rabbit.model_params, cache=name('train_ranking_qrels_and_106756.json', names), limit=10, query_tok_to_doc_tok=query_tok_to_doc_tok, normalized_score_lookup=train_normalized_score_lookup, use_bow_model=use_bow_model, collate_fn=collate_fn, is_test=False) test_dl = build_query_dataloader( documents, test_data, rabbit.train_params, rabbit.model_params, query_tok_to_doc_tok=query_tok_to_doc_tok, normalized_score_lookup=test_normalized_score_lookup, use_bow_model=use_bow_model, collate_fn=collate_fn, is_test=True) val_dl = build_query_dataloader( documents, val_data, rabbit.train_params, rabbit.model_params, query_tok_to_doc_tok=query_tok_to_doc_tok, normalized_score_lookup=val_normalized_score_lookup, use_bow_model=use_bow_model, collate_fn=collate_fn, is_test=True) model = PointwiseScorer(query_token_embeds, 
document_token_embeds, doc_encoder, rabbit.model_params, rabbit.train_params) else: if rabbit.train_params.use_noise_aware_loss: ranker_query_str_to_rankings = get_ranker_query_str_to_rankings( train_query_name_to_id, document_title_to_id, train_queries, limit=rabbit.train_params.num_snorkel_train_queries) query_names = reduce( lambda acc, query_to_ranking: acc.intersection( set(query_to_ranking.keys())) if len(acc) != 0 else set(query_to_ranking.keys()), ranker_query_str_to_rankings.values(), set()) all_ranked_lists_by_ranker = _.map_values( ranker_query_str_to_rankings, lambda query_to_ranking: [query_to_ranking[query] for query in query_names]) ranker_query_str_to_pairwise_bins = get_ranker_query_str_to_pairwise_bins( train_query_name_to_id, document_title_to_id, train_queries, limit=rabbit.train_params.num_train_queries) snorkeller = Snorkeller(ranker_query_str_to_pairwise_bins) snorkeller.train(all_ranked_lists_by_ranker) calc_marginals = snorkeller.calc_marginals else: calc_marginals = None collate_fn = lambda samples: collate_query_pairwise_samples( samples, use_bow_model=use_bow_model, calc_marginals=calc_marginals, use_dense=rabbit.model_params.use_dense) if rabbit.run_params.load_influences: try: with open(rabbit.run_params.influences_path) as fh: pairs_to_flip = defaultdict(set) for pair, influence in json.load(fh): if rabbit.train_params.use_pointwise_loss: condition = True else: condition = influence < rabbit.train_params.influence_thresh if condition: query = tuple(pair[1]) pairs_to_flip[query].add(tuple(pair[0])) except FileNotFoundError: pairs_to_flip = None else: pairs_to_flip = None train_dl = build_query_pairwise_dataloader( documents, train_data, rabbit.train_params, rabbit.model_params, pairs_to_flip=pairs_to_flip, cache=name('train_ranking_qrels_and_106756.json', names), limit=10, query_tok_to_doc_tok=query_tok_to_doc_tok, normalized_score_lookup=train_normalized_score_lookup, use_bow_model=use_bow_model, collate_fn=collate_fn, is_test=False) 
test_dl = build_query_pairwise_dataloader( documents, test_data, rabbit.train_params, rabbit.model_params, query_tok_to_doc_tok=query_tok_to_doc_tok, normalized_score_lookup=test_normalized_score_lookup, use_bow_model=use_bow_model, collate_fn=collate_fn, is_test=True) val_dl = build_query_pairwise_dataloader( documents, val_data, rabbit.train_params, rabbit.model_params, query_tok_to_doc_tok=query_tok_to_doc_tok, normalized_score_lookup=val_normalized_score_lookup, use_bow_model=use_bow_model, collate_fn=collate_fn, is_test=True) val_rel_dl = build_query_pairwise_dataloader( documents, val_data, rabbit.train_params, rabbit.model_params, query_tok_to_doc_tok=query_tok_to_doc_tok, normalized_score_lookup=val_normalized_score_lookup, use_bow_model=use_bow_model, collate_fn=collate_fn, is_test=True, rel_vs_irrel=True, candidates=val_ranking_candidates, num_to_rank=rabbit.run_params.num_to_rank) model = PairwiseScorer(query_token_embeds, document_token_embeds, doc_encoder, rabbit.model_params, rabbit.train_params, use_bow_model=use_bow_model) train_ranking_dataset = RankingDataset( documents, train_dl.dataset.rankings, rabbit.train_params, rabbit.model_params, rabbit.run_params, query_tok_to_doc_tok=query_tok_to_doc_tok, normalized_score_lookup=train_normalized_score_lookup, use_bow_model=use_bow_model, use_dense=rabbit.model_params.use_dense) test_ranking_dataset = RankingDataset( documents, test_ranking_candidates, rabbit.train_params, rabbit.model_params, rabbit.run_params, relevant=test_dl.dataset.rankings, query_tok_to_doc_tok=query_tok_to_doc_tok, cheat=rabbit.run_params.cheat, normalized_score_lookup=test_normalized_score_lookup, use_bow_model=use_bow_model, use_dense=rabbit.model_params.use_dense) val_ranking_dataset = RankingDataset( documents, val_ranking_candidates, rabbit.train_params, rabbit.model_params, rabbit.run_params, relevant=val_dl.dataset.rankings, query_tok_to_doc_tok=query_tok_to_doc_tok, cheat=rabbit.run_params.cheat, 
normalized_score_lookup=val_normalized_score_lookup, use_bow_model=use_bow_model, use_dense=rabbit.model_params.use_dense) if rabbit.train_params.memorize_test: train_dl = test_dl train_ranking_dataset = test_ranking_dataset model_data = DataBunch(train_dl, val_rel_dl, test_dl, collate_fn=collate_fn, device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')) multi_objective_model = MultiObjective(model, rabbit.train_params, rel_score, additive) model_to_save = multi_objective_model if rabbit.train_params.memorize_test: try: del train_data except: pass if not rabbit.run_params.just_caches: del document_lookup del train_query_lookup del query_token_lookup del document_token_lookup del train_queries try: del glove_lookup except UnboundLocalError: del q_glove_lookup del doc_glove_lookup if rabbit.run_params.load_model: try: multi_objective_model.load_state_dict( torch.load(rabbit.run_params.load_path)) except RuntimeError: dp = nn.DataParallel(multi_objective_model) dp.load_state_dict(torch.load(rabbit.run_params.load_path)) multi_objective_model = dp.module else: train_model(multi_objective_model, model_data, train_ranking_dataset, val_ranking_dataset, test_ranking_dataset, rabbit.train_params, rabbit.model_params, rabbit.run_params, experiment) if rabbit.train_params.fine_tune_on_val: fine_tune_model_data = DataBunch( val_rel_dl, val_rel_dl, test_dl, collate_fn=collate_fn, device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')) train_model(multi_objective_model, fine_tune_model_data, val_ranking_dataset, val_ranking_dataset, test_ranking_dataset, rabbit.train_params, rabbit.model_params, rabbit.run_params, experiment, load_path=rabbit.run_params.load_path) multi_objective_model.eval() device = model_data.device gpu_multi_objective_model = multi_objective_model.to(device) if rabbit.run_params.calc_influence: if rabbit.run_params.freeze_all_but_last_for_influence: last_layer_idx = _.find_last_index( 
multi_objective_model.model.pointwise_scorer.layers, lambda layer: isinstance(layer, nn.Linear)) to_last_layer = lambda x: gpu_multi_objective_model( *x, to_idx=last_layer_idx) last_layer = gpu_multi_objective_model.model.pointwise_scorer.layers[ last_layer_idx] diff_wrt = [p for p in last_layer.parameters() if p.requires_grad] else: diff_wrt = None test_hvps = calc_test_hvps( multi_objective_model.loss, gpu_multi_objective_model, DeviceDataLoader(train_dl, device, collate_fn=collate_fn), val_rel_dl, rabbit.run_params, diff_wrt=diff_wrt, show_progress=True, use_softrank_influence=rabbit.run_params.use_softrank_influence) influences = [] if rabbit.train_params.use_pointwise_loss: num_real_samples = len(train_dl.dataset) else: num_real_samples = train_dl.dataset._num_pos_pairs if rabbit.run_params.freeze_all_but_last_for_influence: _sampler = SequentialSamplerWithLimit(train_dl.dataset, num_real_samples) _batch_sampler = BatchSampler(_sampler, rabbit.train_params.batch_size, False) _dl = DataLoader(train_dl.dataset, batch_sampler=_batch_sampler, collate_fn=collate_fn) sequential_train_dl = DeviceDataLoader(_dl, device, collate_fn=collate_fn) influences = calc_dataset_influence(gpu_multi_objective_model, to_last_layer, sequential_train_dl, test_hvps, sum_p=True).tolist() else: for i in progressbar(range(num_real_samples)): train_sample = train_dl.dataset[i] x, labels = to_device(collate_fn([train_sample]), device) device_train_sample = (x, labels.squeeze()) influences.append( calc_influence(multi_objective_model.loss, gpu_multi_objective_model, device_train_sample, test_hvps, diff_wrt=diff_wrt).sum().tolist()) with open(rabbit.run_params.influences_path, 'w+') as fh: json.dump([[train_dl.dataset[idx][1], influence] for idx, influence in enumerate(influences)], fh)
# Paginated harvest of PDOK SPARQL results into a GeoJSON FeatureCollection.
# Relies on names defined earlier in the script: geoJson (FeatureCollection dict),
# offsetCount / maxOffsetCount / offset / limit (pagination state), query (SPARQL
# text presumably containing the OFFSET placeholder state), and saveToFile.
print("saving...")
# Snapshot the accumulated features to disk before (re)entering the fetch loop.
with open('plaats.json', 'w') as outfile:
    json.dump(geoJson, outfile)
print("saved to file!")
while offsetCount < maxOffsetCount:
    # URL-encode the SPARQL query into the PDOK endpoint's query string.
    url = "https://data.pdok.nl/sparql?query=" + quote(query)
    try:
        # NOTE(review): the `as url` target shadows the `url` string built above
        # with the open response object — harmless here but confusing; consider
        # renaming the context variable.
        with urlopen(Request(url, headers={'Accept': 'application/json'})) as url:
            data = json.loads(url.read().decode())
            print(data)
            # NOTE(review): `data['results']` is a dict in SPARQL JSON responses,
            # so len(...) counts its keys, not the bindings — presumably the
            # intent was len(data['results']['bindings']); confirm against the
            # endpoint's actual payload.
            if len(data['results']) > 0:
                for value in data['results']['bindings']:
                    # Flatten each binding from {'var': {'value': ...}} to {'var': ...}.
                    value = pydash.map_values(value, lambda x: x['value'])
                    # Parse the WKT geometry and wrap it as a GeoJSON Feature,
                    # keeping the remaining bindings as properties.
                    g1 = shapely.wkt.loads(value['geometry'])
                    g2 = geojson.Feature(geometry=g1, properties=value)
                    # Drop the raw WKT string now that it lives in `geometry`.
                    del g2['properties']['geometry']
                    geoJson['features'].append(g2)
                print(
                    "incrementing offset to " + str(offset) + " .... " +
                    "current length of features ", len(geoJson['features']))
                # Advance pagination state for the next request.
                offset += limit
                offsetCount += 1
            else:
                # Break when we run out of results... if you have a lot of
                # results this loop will take A LONG TIME to run.
                break
    except Exception as inst:
        # NOTE(review): this passes the `json` MODULE, not the harvested data —
        # likely meant saveToFile(geoJson); verify saveToFile's signature.
        # Also, `inst` is bound but never used, and the loop continues after
        # the failure with unchanged offset — confirm that retry is intended.
        saveToFile(json)
def update_parser_for_functions(self, modul_name, parsers, class_ref):
    """Register every function of *class_ref* on the module's argparse parser.

    Walks the functions of ``class_ref`` and, per function, either attaches a
    custom argparse action (when the function was decorated with
    ``_action_param_action`` metadata) or creates/reuses a subparser named
    after the function and populates it with arguments derived from the
    function's signature annotations.

    :param modul_name: key into *parsers* identifying the parent parser entry
    :param parsers: dict of parser records; each record is a dict that is
        mutated in place (keys ``parser``, ``subparsers``, ``callables``,
        ``action_nargs`` are added/updated)
    :param class_ref: class whose functions are inspected and registered
    :raises Exception: when a parameter references an argument group name
        that was never built for the function
    """
    parent_path_parser = parsers[modul_name]
    for function_name, function_ref in inspect.getmembers(class_ref):
        if inspect.isfunction(function_ref):
            groups = {}
            # Lazily initialise the bookkeeping slots on the parser record.
            if not "callables" in parent_path_parser:
                parent_path_parser['callables'] = {}
            if not "subparsers" in parent_path_parser:
                parent_path_parser['subparsers'] = parent_path_parser[
                    'parser'].add_subparsers(
                        help=self.get_config_value(modul_name, 'sub_help'))
            # Functions carrying custom-action metadata become a flag on the
            # PARENT parser (not a subcommand) and are skipped thereafter.
            if hasattr(function_ref, '_action_param_action'):
                if not 'action_nargs' in parent_path_parser:
                    parent_path_parser['action_nargs'] = {}
                # Remember the nargs per function name so callers can
                # consult it later.
                parent_path_parser['action_nargs'][
                    function_ref.
                    __name__] = function_ref._action_param_nargs
                parent_path_parser['parser'].add_argument(
                    f'--{function_name}',
                    action=function_ref._action_param_action(
                        *function_ref._action_param_args,
                        **function_ref._action_param_kwargs),
                    nargs=function_ref._action_param_nargs)
                continue
            # Reuse an existing subparser for this function if one was
            # already created on a previous pass; otherwise add a new one.
            if "subparsers" in parent_path_parser and hasattr(
                    parent_path_parser['subparsers'],
                    'choices') and function_name in parent_path_parser[
                        'subparsers'].choices:
                parser = parent_path_parser['subparsers'].choices[
                    function_name]
            else:
                parser = parent_path_parser['subparsers'].add_parser(
                    function_name,
                    description=self.get_config_value(
                        f'{modul_name}.{function_name}', 'description'))
            # Build argparse argument groups declared via the function's
            # ``_arg_groups`` metadata; keyed by group name. (Note: only the
            # LAST declaration of each name wins — groupArray[-1].)
            if hasattr(function_ref, '_arg_groups'):
                groups = map_values(
                    group_by(function_ref._arg_groups, 'name'),
                    lambda groupArray: self.add_group_to_parrser(
                        parser, groupArray[-1]))
            # Warn (on stderr) when two classes expose a same-named function;
            # the later registration silently replaces the earlier one.
            if function_name in parent_path_parser['callables']:
                sys.stderr.write(
                    f'{function_name} in {parent_path_parser["callables"]["class_ref"].__name__} is being overwritten by {function_name} in {class_ref.__name__}'
                )
            parent_path_parser['callables'][function_name] = {
                "parser": parser,
                "class_ref": class_ref,
                "function_name": function_name,
                "groups": groups
            }
            # Translate the function's signature into argparse arguments.
            for param in inspect.signature(
                    function_ref).parameters.values():
                parser = parent_path_parser['callables'][function_name][
                    'parser']
                if param.annotation.__class__ == ParserArgType:
                    # ParserArgType carries explicit args/kwargs; default the
                    # flag name to --<param> when none were given.
                    args = tuple([f'--{param.name}']) if len(
                        param.annotation.args
                    ) == 0 else param.annotation.args
                    # When custom flag names are supplied, make sure argparse
                    # still stores the value under the parameter's name.
                    if len(param.annotation.args
                           ) > 0 and 'dest' not in param.annotation.kwargs:
                        param.annotation.kwargs['dest'] = param.name
                    if hasattr(param.annotation, 'group'):
                        group = get(groups, param.annotation.group)
                        if group:
                            group.add_argument(*args,
                                               **param.annotation.kwargs)
                        else:
                            raise Exception(
                                f'it appears that the group "{param.annotation.group}" is referenced by an arguement but not found when building the parser existing groups are {json.dumps(list(groups.keys()))}'
                            )
                    else:
                        parser.add_argument(*args,
                                            **param.annotation.kwargs)
                # Plain int/str annotations become typed --<param> options.
                if param.annotation == int:
                    parser.add_argument(f'--{param.name}',
                                        type=param.annotation)
                if param.annotation == str:
                    parser.add_argument(f'--{param.name}',
                                        type=param.annotation)
def test_map_values(case, expected):
    """Assert that ``_.map_values`` applied to *case* yields *expected*."""
    actual = _.map_values(*case)
    assert actual == expected
def get_raw_datasets(cursors, num_items):
    """Materialize up to *num_items* records from each cursor.

    Builds a single fetcher bound to *num_items* and maps it over the
    values of *cursors*, preserving the keys.
    """
    fetcher = _get_data_fetcher(num_items)
    return _.map_values(cursors, fetcher)