Example #1
def query():
    w = pickle.load(open('weights_from_query.pkl', 'rb')).squeeze()
    topk_vals, topk_idxs = torch.topk(w, 30)
    bottomk_vals, bottomk_idxs = torch.topk(-w, 30)
    docs, lookup = pickle.load(open('parsed_robust_queries.pkl', 'rb'))
    tf, df, idf = count_me(docs)
    inv_lookup = _.invert(lookup)
    print('Top30: ', [inv_lookup[idx] for idx in topk_idxs.tolist()])
    print('Bottom30: ', [inv_lookup[idx] for idx in bottomk_idxs.tolist()])
    glove = get_glove_lookup()
    glove_by_idx = _.map_keys(
        glove, lambda vec, token: lookup[token]
        if token in lookup else lookup['<unk>'])
    norms_by_idx = _.map_values(glove_by_idx, torch.norm)
    idxs_in_order = list(norms_by_idx.keys())
    idfs_in_order = torch.tensor([idf[idx] for idx in idxs_in_order])
    dfs_in_order = torch.tensor([df[idx] for idx in idxs_in_order])
    tfs_in_order = torch.tensor([tf[idx] for idx in idxs_in_order])
    norms_in_order = torch.tensor([norms_by_idx[idx] for idx in idxs_in_order])
    w_subset = w[torch.tensor(idxs_in_order)]
    # Pearson correlations (the off-diagonal of np.corrcoef's 2x2 output)
    # between the learned weights and each statistic, raw then log-scaled:
    print(np.corrcoef(w_subset, tfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, dfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, idfs_in_order)[0, 1])
    print(np.corrcoef(w_subset, norms_in_order)[0, 1])
    print(np.corrcoef(w_subset, np.log(tfs_in_order + 1))[0, 1])
    print(np.corrcoef(w_subset, np.log(dfs_in_order))[0, 1])
    print(np.corrcoef(w_subset, np.log(idfs_in_order))[0, 1])
    print(np.corrcoef(w_subset, np.log(norms_in_order + 1))[0, 1])
Example #2
def check_fleet_config(
        fleet: Any,
        is_setup_stage=False) -> Tuple[bool, Optional[Dict[int, int]]]:
    """
    Проверяет конфигурацию флота (1 4палубный, 2 3палубных итд)
    :param fleet: список кораблей
    :param is_setup_stage: Если True, то отключает проверку на недостающие корабли (считается, что поле находится в
    процессе заполнения и игрок еще не выставил все корабли)
    :return: (bool, [dict])
    Если флот собран корректно, возвращает True, None
    Если есть лишние корабли, возвращает False, None
    Если кораблей не хватает, возвращает True или False (в зависимости от is_setup_stage) и список слотов
    """
    lengths = map(len, fleet)
    config = Counter(lengths)

    if config == SHIP_CONFIG:
        return True, None

    # Checking for extra ships
    configs = group_by_keys((config, SHIP_CONFIG), 0)
    diff = py_.map_values(configs, lambda counts: counts[1] - counts[0])
    extra_ships = any(py_.map_(list(diff.values()), lambda x: x < 0))
    if extra_ships:
        return False, None

    missing_ships = {k: v for k, v in diff.items() if v > 0}
    if missing_ships:
        return is_setup_stage, missing_ships
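
A minimal usage sketch, assuming a classic Battleship layout where SHIP_CONFIG is Counter({4: 1, 3: 2, 2: 3, 1: 4}) and each ship is just its list of cells (both are assumptions; the real module defines SHIP_CONFIG and the group_by_keys helper):

# Sketch only: SHIP_CONFIG and the ship representation below are assumed.
from collections import Counter

SHIP_CONFIG = Counter({4: 1, 3: 2, 2: 3, 1: 4})

full_fleet = [[0] * 4] + [[0] * 3] * 2 + [[0] * 2] * 3 + [[0]] * 4
check_fleet_config(full_fleet)
# -> (True, None): the fleet matches SHIP_CONFIG exactly

partial_fleet = [[0] * 4]  # only the four-deck ship placed so far
check_fleet_config(partial_fleet, is_setup_stage=True)
# -> (True, {3: 2, 2: 3, 1: 4}): valid mid-setup, with the missing slots
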
Example #3
def get_query_str_to_pairwise_bins(query_name_to_id,
                                   document_title_to_id,
                                   queries,
                                   path,
                                   limit=None):
    pairwise_bins_by_query = defaultdict(list)
    with open(path) as fh:
        while True:
            if limit is not None and len(pairwise_bins_by_query) >= limit:
                break
            line = fh.readline()
            if line:
                query_name, __, doc_title, __, __, ___ = line.strip().split(
                    ' ')
                if query_name not in query_name_to_id: continue
                if doc_title not in document_title_to_id: continue
                query_id = query_name_to_id[query_name]
                if query_id not in queries:
                    query_id = str(query_id)
                    if query_id not in queries: continue
                pairwise_bins_by_query[str(queries[query_id])[1:-1]].append(
                    document_title_to_id[doc_title])
            else:
                break
        result = _.map_values(dict(pairwise_bins_by_query), get_pairwise_bins)
        return defaultdict(lambda: (set(), set()), result)
Example #4
def _drop_overlapping_mentions(link_contexts):
  entity_mention_pairs = sum(_.map_values(link_contexts,
                                          lambda mentions, entity: [[entity, mention] for mention in mentions]).values(),
                             [])
  __, reduced_link_contexts = reduce(_drop_overlapping_mentions_reducer,
                                     entity_mention_pairs,
                                     ([], {}))
  return reduced_link_contexts
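
The map_values call above leans on pydash dispatching on callback arity: a two-argument lambda receives (value, key), so each entity key is paired with its mentions, and summing the resulting lists with a [] start flattens them into pairs. A minimal sketch of just that pattern (the mention values here are placeholder strings):

import pydash as _

link_contexts = {'Paris': ['m1', 'm2'], 'Texas': ['m3']}
pairs = sum(
    _.map_values(link_contexts,
                 lambda mentions, entity: [[entity, m] for m in mentions]).values(),
    [])
# pairs == [['Paris', 'm1'], ['Paris', 'm2'], ['Texas', 'm3']]
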
Example #5
def problem2():
    lines = read_input()
    sleeps = parse(lines)
    sleep_maps = pydash.map_values(sleeps, sleep_map)
    sorted_minutes = list(
        sorted(flatten_minutes(sleep_maps), key=lambda s: s[2]))
    best = sorted_minutes[-1]
    print(best[0] * best[1])
Example #6
def main():
    load_dotenv(dotenv_path='.env')
    EL_DATABASE_NAME = os.getenv("EL_DBNAME")
    DATABASE_USER = os.getenv("DBUSER")
    DATABASE_PASSWORD = os.getenv("DBPASS")
    DATABASE_HOST = os.getenv("DBHOST")
    connection = pymysql.connect(host=DATABASE_HOST,
                                 user=DATABASE_USER,
                                 password=DATABASE_PASSWORD,
                                 db=EL_DATABASE_NAME,
                                 charset='utf8mb4',
                                 use_unicode=True,
                                 cursorclass=pymysql.cursors.SSDictCursor)
    try:
        with connection.cursor() as cursor:
            cursor.execute("SET NAMES utf8mb4;")
            cursor.execute("SET CHARACTER SET utf8mb4;")
            cursor.execute("SET character_set_connection=utf8mb4;")
            cursor.execute(
                'select mention, entity_id from entity_mentions_text')
            candidates_prior = defaultdict(lambda: defaultdict(int))
            entity_labels = {}
            for row in progressbar(cursor):
                if row['entity_id'] not in entity_labels:
                    entity_labels[row['entity_id']] = len(entity_labels)
                entity_label = entity_labels[row['entity_id']]
                candidates_prior[row['mention']][entity_label] += 1

            cursor.execute(
                'select distinct entity_id, entity from entity_mentions_text')
            for row in progressbar(cursor):
                if row['entity_id'] not in entity_labels:
                    entity_labels[row['entity_id']] = len(entity_labels)
                entity_label = entity_labels[row['entity_id']]
                candidates_prior[row['entity']][entity_label] += 1

            cursor.execute(
                'select distinct preredirect, entity_id from mentions m join entity_mentions em on em.mention_id = m.id'
            )
            for row in progressbar(cursor):
                if row['entity_id'] not in entity_labels:
                    entity_labels[row['entity_id']] = len(entity_labels)
                entity_label = entity_labels[row['entity_id']]
                candidates_prior[row['preredirect']][entity_label] += 1
            with open('lookups.pkl', 'wb') as lookup_file:
                pickle.dump(
                    {
                        'lookups': {
                            'entity_candidates_prior':
                            _.map_values(dict(candidates_prior), dict),
                            'entity_labels':
                            entity_labels
                        },
                        'train_size': 1.0
                    }, lookup_file)
    finally:
        connection.close()
Example #7
def main():
  load_dotenv(dotenv_path='.env')
  EL_DATABASE_NAME = os.getenv("EL_DBNAME")
  DATABASE_USER = os.getenv("DBUSER")
  DATABASE_PASSWORD = os.getenv("DBPASS")
  DATABASE_HOST = os.getenv("DBHOST")
  connection = pymysql.connect(host=DATABASE_HOST,
                               user=DATABASE_USER,
                               password=DATABASE_PASSWORD,
                               db=EL_DATABASE_NAME,
                               charset='utf8mb4',
                               use_unicode=True,
                               cursorclass=pymysql.cursors.DictCursor)
  try:
    with connection.cursor() as cursor:
      cursor.execute("SET NAMES utf8mb4;")
      cursor.execute("SET CHARACTER SET utf8mb4;")
      cursor.execute("SET character_set_connection=utf8mb4;")
      cursor.execute('select mention, entity_id, page_id from entity_mentions_text')
      candidates_prior = defaultdict(lambda: defaultdict(int))
      entity_labels = {}
      train_size = 0.8
      try:
        with open('./page_id_order.pkl', 'rb') as f:
          page_id_order = pickle.load(f)
      except Exception as e:
        raise type(e)(str(e) + '\n' + 'Create `page_id_order.pkl` by running `create_page_id_order.py`').with_traceback(sys.exc_info()[2])
      num_train_pages = int(len(page_id_order) * train_size)
      train_page_id_order = set(page_id_order[:num_train_pages])  # set: O(1) membership checks in the loop below
      for row in cursor.fetchall():
        if row['entity_id'] not in entity_labels:
          entity_labels[row['entity_id']] = len(entity_labels)
        if row['page_id'] not in train_page_id_order: continue
        entity_label = entity_labels[row['entity_id']]
        candidates_prior[row['mention']][entity_label] += 1

      cursor.execute('select distinct entity_id, entity from entity_mentions_text')
      for row in cursor.fetchall():
        if row['entity_id'] not in entity_labels:
          entity_labels[row['entity_id']] = len(entity_labels)
        entity_label = entity_labels[row['entity_id']]
        candidates_prior[row['entity']][entity_label] += 1

      cursor.execute('select distinct preredirect, entity_id from mentions m join entity_mentions em on em.mention_id = m.id')
      for row in cursor.fetchall():
        if row['entity_id'] not in entity_labels:
          entity_labels[row['entity_id']] = len(entity_labels)
        entity_label = entity_labels[row['entity_id']]
        candidates_prior[row['preredirect']][entity_label] += 1
      with open('lookups.pkl', 'wb') as lookup_file:
        pickle.dump({'lookups': {'entity_candidates_prior': _.map_values(dict(candidates_prior), dict),
                                 'entity_labels': entity_labels},
                     'train_size': train_size},
                    lookup_file)
  finally:
    connection.close()
Example #8
def process_raw_candidates(query_name_to_id, queries, document_title_to_id,
                           query_names, raw_ranking_candidates):
    ranking_candidates = _.pick(raw_ranking_candidates, query_names)
    lookup_by_title = lambda title: document_title_to_id.get(title) or 0
    test_ranking_candidates = _.map_values(
        ranking_candidates,
        lambda candidate_names: _.map_(candidate_names, lookup_by_title))
    # Keys become the query's stringified token list with its surrounding
    # brackets stripped, matching the key format used for ranking lookups.
    return _.map_keys(
        test_ranking_candidates, lambda ranking, query_name: str(queries[
            query_name_to_id[query_name]])[1:-1])
Example #9
    def call_from_cli(self, command, args, verbose=True):
        command = command.replace('-', '_')

        if Component.instance:
            self = Component.instance

        if hasattr(self, command):
            command = getattr(self, command)

            method_container = command

            if hasattr(command, 'redirect'):
                method_container = command.redirect

            params = inspect.signature(method_container).parameters.values()
            is_static = isinstance(
                method_container,
                types.FunctionType) and list(params)[0].name != 'self'

            if has_named_args(args):
                named = named_args_as_positional(args, params, self.name,
                                                 method_container.__name__)
                args = named['args']
                named['properties'] = _.map_values(named['properties'],
                                                   string_to_any)
                self.init(named['properties'])

            last_param = list(params)[-1]
            is_consuming_rest = last_param.kind == last_param.VAR_POSITIONAL

            if not is_consuming_rest and len(args) > len(params):
                result = {
                    'error':
                    f'Wrong number of arguments passed to {command.__name__}: expected {len(params)}, got {len(args)}.'
                }
            else:
                args = list(map(string_to_any, args))

                if not is_static:
                    args.insert(0, self)

                self.called_from_cli = True
                command.as_cli = True

                result = command(*args)
        else:
            result = {'error': f'{self.name} has no method {command}.'}

        if verbose:
            if result:
                print(respond(result))
        else:
            return result
Example #10
def merge_mentions(processed_pages):
  '''merge the link contexts from a list of pages'''
  concat = lambda dest, src: dest + src if dest else src
  link_contexts = reduce(lambda acc, val: _.merge_with(acc, val, iteratee=concat),
                         [processed_page['link_contexts'] for processed_page in processed_pages],
                         {})
  entity_counts = reduce(lambda acc, val: _.merge_with(acc, val, iteratee=concat),
                          [processed_page['entity_counts'] for processed_page in processed_pages],
                          {})
  return _.map_values(link_contexts,
                      lambda val, key: {'link_contexts': val,
                                        'entity_counts': entity_counts[key]})
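
merge_mentions folds the per-page dicts together with merge_with, concatenating list values that land under the same key. A minimal sketch of one fold step, mirroring the iteratee keyword used above:

import pydash as _

concat = lambda dest, src: dest + src if dest else src
page_a = {'Paris': ['m1'], 'Texas': ['m2']}
page_b = {'Paris': ['m3']}
merged = _.merge_with({}, page_a, page_b, iteratee=concat)
# merged == {'Paris': ['m1', 'm3'], 'Texas': ['m2']}
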
Example #11
def process_page(redirects_lookup, page, is_seed_page=False):
  cleaned_page = clean_page(page)
  document_info = {'source_id': cleaned_page['pageID'],
                   'title': cleaned_page['title'],
                   'text': cleaned_page['plaintext'],
                   'categories': cleaned_page['categories'],
                   'is_disambiguation_page': cleaned_page['isDisambiguation'],
                   'is_seed_page': is_seed_page}
  link_contexts = get_link_contexts_using_heuristics(redirects_lookup, cleaned_page)
  entity_counts = _.map_values(link_contexts, len)
  return {'document_info': document_info,
          'link_contexts': link_contexts,
          'entity_counts': entity_counts}
Example #12
def test_process_page():
    with open('test/fixtures/parade_page.json') as f:
        parade_page = json.load(f)
    with open('test/fixtures/parade_page_contexts.json') as f:
        parade_page_contexts = json.load(f)
    redirects_lookup = {}
    processed_page = pp.process_page(redirects_lookup, parade_page)
    assert processed_page['document_info']['title'] == parade_page['title']
    assert processed_page['document_info']['text'] == parade_page['plaintext']
    assert processed_page['document_info']['categories'] == parade_page[
        'categories']
    assert processed_page['link_contexts'] == parade_page_contexts
    assert processed_page['entity_counts'] == _.map_values(
        parade_page_contexts, len)
Example #13
def test_get_page_iobes():
  with open('test/fixtures/parade_page_db.json') as f:
    parade_page = json.load(f)
  with open('test/fixtures/parade_page_contexts.json') as f:
    filter_out_of_bounds = lambda mention: mention['offset'] < len(parade_page['content'])
    parade_page_contexts = _.map_values(json.load(f),
                                        lambda mentions: list(filter(filter_out_of_bounds, mentions)))
  context_pairs = _.mapcat(_.to_pairs(parade_page_contexts),
                           lambda pair: [[pair[0], mention] for mention in pair[1]])
  contexts = _.sort_by(context_pairs,
                       lambda title_mention: title_mention[1]['offset'])
  mentions = _.flat_map(contexts, _.last)
  mention_link_titles = list(map(_.head, contexts))
  assert parade_iobes == iobes.get_page_iobes(parade_page, mentions, mention_link_titles)
Example #14
 def __init__(self,
              cursor,
              page_id_order,
              entity_candidates_prior,
              entity_label_lookup,
              embedding,
              token_idx_lookup,
              batch_size,
              num_entities,
              num_candidates,
              cheat=False,
              buffer_scale=1,
              min_mentions=1,
              use_fast_sampler=False,
              use_wiki2vec=False,
              start_from_page_num=0):
     self.page_id_order = page_id_order
     self.entity_candidates_prior = entity_candidates_prior
     self.entity_label_lookup = _.map_values(entity_label_lookup,
                                             torch.tensor)
     self.entity_id_lookup = {
         int(label): entity_id
         for entity_id, label in self.entity_label_lookup.items()
     }
     self.embedding = embedding
     self.token_idx_lookup = token_idx_lookup
     self.cursor = cursor
     self.batch_size = batch_size
     self.num_entities = num_entities
     self.num_candidates = num_candidates
     self._sentence_spans_lookup = {}
     self._page_content_lookup = {}
     self._embedded_page_content_lookup = {}
     self._entity_page_mentions_lookup = {}
     self._mentions_per_page_ctr = {}
     self._mention_infos = {}
     self._candidate_strs_lookup = {}
     self._bag_of_nouns_lookup = {}
     self.page_ctr = start_from_page_num
     self.cheat = cheat
     self.buffer_scale = buffer_scale
     self.min_mentions = min_mentions
     self.use_fast_sampler = use_fast_sampler
     self.use_wiki2vec = use_wiki2vec
     # if self.use_fast_sampler: assert not self.use_wiki2vec, 'train wiki2vec locally'
     if self.min_mentions > 1:
         query = 'select id from entities where num_mentions >= ' + str(
             self.min_mentions)
         cursor.execute(query)
         self.valid_entity_ids = set(row['id'] for row in cursor.fetchall())
Example #15
def get_votes_by_bill(leg_id_to_pol_id,
                      state_votes_path='data/statehvotes.json') -> Dict[str, Dict[str, Set[int]]]:
  def _split_for_against_abstain(bill_info):
    leg_ids_to_votes = bill_info['votes']
    vote_for, vote_against, vote_abstain = set(), set(), set()
    for leg_id, vote in leg_ids_to_votes.items():
      if leg_id not in leg_id_to_pol_id: continue
      if vote == 'yes': vote_for.add(leg_id_to_pol_id[leg_id])
      elif vote == 'no': vote_against.add(leg_id_to_pol_id[leg_id])
      else: vote_abstain.add(leg_id_to_pol_id[leg_id])
    return {'for': vote_for, 'against': vote_against, 'abstain': vote_abstain}
  with open(state_votes_path) as fh:
    votes = json.load(fh)
  return _.map_values(votes, _split_for_against_abstain)
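
The votes file is inferred (from the code, not documented) to map bill ids to {'votes': {leg_id: 'yes' | 'no' | ...}}; map_values then turns each bill into for/against/abstain sets of politician ids. A small illustration of one bill's transformation:

leg_id_to_pol_id = {'L1': 10, 'L2': 11}
bill_info = {'votes': {'L1': 'yes', 'L2': 'no', 'L9': 'yes'}}
# _split_for_against_abstain(bill_info) would yield
# {'for': {10}, 'against': {11}, 'abstain': set()}
# ('L9' is dropped because it has no politician-id mapping)
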
Example #16
    def _read_attributes(self):
        '''
        Returns
        -------
        dict
            Value of each attribute in remote context.

            For each attribute, if type is not supported (i.e., not a plain old
            data type), value is set to ``None``.

        See also
        --------
        :meth:`_write_attribute`
        '''
        return py_.map_values(self._attributes, lambda v, k:
                              self._read_attribute(k, None))
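
Here map_values ignores each stored value and re-reads the attribute by key; because pydash inspects callback arity, the (v, k) lambda receives both the value and the key. A standalone sketch of the same shape (_read_attribute is the class's own helper, so a stand-in dict is used):

import pydash as py_

attributes = {'voltage': None, 'mode': None}
read_attribute = lambda name, default: {'voltage': 3.3}.get(name, default)
py_.map_values(attributes, lambda v, k: read_attribute(k, None))
# -> {'voltage': 3.3, 'mode': None}
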
Example #17
def test_process_page_with_redirects():
    with open('test/fixtures/parade_page.json') as f:
        parade_page = json.load(f)
    with open('test/fixtures/parade_page_contexts.json') as f:
        parade_page_contexts = json.load(f)
    redirects_lookup = {"Fort de Goede Hoop": "Kaapstad"}
    processed_page = pp.process_page(redirects_lookup, parade_page)
    assert processed_page['document_info']['title'] == parade_page['title']
    assert processed_page['document_info']['text'] == parade_page['plaintext']
    assert processed_page['document_info']['categories'] == parade_page[
        'categories']
    parade_page_contexts["Kaapstad"].insert(
        1,
        parade_page_contexts.pop("Fort de Goede Hoop")[0])
    assert processed_page['link_contexts'] == parade_page_contexts
    assert processed_page['entity_counts'] == _.map_values(
        parade_page_contexts, len)
Example #18
def count_me(docs):
    # Indices 0 and 1 (presumably padding/unknown tokens) are pre-seeded so
    # they always have term- and document-frequency entries.
    tf = {0: 0, 1: 0}
    df = {0: 1, 1: 1}
    for doc in docs:
        in_doc = set()
        for idx in doc:
            in_doc.add(idx)
            if idx in tf:
                tf[idx] += 1
            else:
                tf[idx] = 1
        for idx in in_doc:
            if idx in df:
                df[idx] += 1
            else:
                df[idx] = 1
    # Note: this "idf" is the reciprocal document frequency, not log(N/df).
    idf = _.map_values(df, lambda cnt: 1 / cnt)
    return tf, df, idf
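
A worked example of count_me on a tiny corpus of token-index lists. Note that tf counts every occurrence, df counts documents (with indices 0 and 1 pre-seeded), and the returned idf is the plain reciprocal 1/df rather than the usual log-scaled variant:

docs = [[2, 2, 3], [3, 4]]
tf, df, idf = count_me(docs)
# tf  == {0: 0, 1: 0, 2: 2, 3: 2, 4: 1}
# df  == {0: 1, 1: 1, 2: 1, 3: 2, 4: 1}
# idf == {0: 1.0, 1: 1.0, 2: 1.0, 3: 0.5, 4: 1.0}
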
Example #19
def nest(collection, *properties):
    """This method is like :func:`group_by` except that it supports nested
    grouping by multiple string `properties`. If only a single key is given, it
    is like calling ``group_by(collection, prop)``.

    Args:
        collection (list|dict): Collection to iterate over.
        *properties (str): Properties to nest by.

    Returns:
        dict: Results of nested grouping by `properties`.

    Example:

        >>> results = nest([{'shape': 'square', 'color': 'red', 'qty': 5},\
                            {'shape': 'square', 'color': 'blue', 'qty': 10},\
                            {'shape': 'square', 'color': 'orange', 'qty': 5},\
                            {'shape': 'circle', 'color': 'yellow', 'qty': 5},\
                            {'shape': 'circle', 'color': 'pink', 'qty': 10},\
                            {'shape': 'oval', 'color': 'purple', 'qty': 5}],\
                           'shape', 'qty')
        >>> expected = {\
            'square': {5: [{'shape': 'square', 'color': 'red', 'qty': 5},\
                           {'shape': 'square', 'color': 'orange', 'qty': 5}],\
                       10: [{'shape': 'square', 'color': 'blue', 'qty': 10}]},\
            'circle': {5: [{'shape': 'circle', 'color': 'yellow', 'qty': 5}],\
                       10: [{'shape': 'circle', 'color': 'pink', 'qty': 10}]},\
            'oval': {5: [{'shape': 'oval', 'color': 'purple', 'qty': 5}]}}
        >>> results == expected
        True

    .. versionadded:: 4.3.0
    """
    if not properties:
        return collection

    properties = pyd.flatten(properties)
    first, rest = properties[0], properties[1:]

    return pyd.map_values(group_by(collection, first),
                          lambda value: nest(value, *rest))
Example #21
def test_embed_page_content():
    embedding_dict = _.map_values(
        {
            '<PAD>': [-1],
            '<UNK>': [0],
            'MENTION_START_HERE': [-2],
            'MENTION_END_HERE': [-3],
            'a': [1],
            'b': [2],
            'c': [3],
            'd': [4]
        }, torch.tensor)
    token_idx_lookup = dict(
        zip(embedding_dict.keys(), range(len(embedding_dict))))
    embedding = nn.Embedding.from_pretrained(
        torch.stack([embedding_dict[token] for token in token_idx_lookup]))
    page_mention_infos = [{'offset': 2, 'mention': 'b c'}]
    page_content = 'a b c d'
    embedded = torch.tensor([[1], [-2], [2], [3], [-3], [4]])
    assert torch.equal(
        dt.embed_page_content(embedding, token_idx_lookup, page_content,
                              page_mention_infos), embedded)
Example #22
 def __init__(self,
              cursor,
              page_id_order,
              entity_candidates_prior,
              entity_label_lookup,
              embedding,
              token_idx_lookup,
              batch_size,
              num_entities,
              num_candidates,
              entity_embeds,
              cheat=False,
              buffer_scale=1,
              min_mentions=1,
              use_fast_sampler=False,
              use_wiki2vec=False,
              use_sum_encoder=False,
              start_from_page_num=0,
              ablation=['local_context', 'document_context', 'prior']):
     self._candidate_strs_lookup = read_cache(
         './candidate_strs_lookup.pkl', lambda: get_str_lookup(cursor))
     self.page_id_order = page_id_order
     self.entity_candidates_prior = entity_candidates_prior
     self.entity_label_lookup = _.map_values(entity_label_lookup,
                                             torch.tensor)
     self.entity_id_lookup = {
         int(label): entity_id
         for entity_id, label in self.entity_label_lookup.items()
     }
     self.embedding = embedding
     self.token_idx_lookup = token_idx_lookup
     self.cursor = cursor
     self.batch_size = batch_size
     self.num_entities = num_entities
     self.num_candidates = num_candidates
     self._sentence_spans_lookup = {}
     self._page_content_lookup = {}
     self._embedded_page_content_lookup = {}
     self._page_token_cnts_lookup = {}
     self._entity_page_mentions_lookup = {}
     self._mentions_per_page_ctr = defaultdict(int)
     self._mention_infos = {}
     self._bag_of_nouns_lookup = {}
     self.page_ctr = start_from_page_num
     self.cheat = cheat
     self.buffer_scale = buffer_scale
     self.min_mentions = min_mentions
     self.use_fast_sampler = use_fast_sampler
     self.use_wiki2vec = use_wiki2vec
     self.use_sum_encoder = use_sum_encoder
     # if self.use_fast_sampler: assert not self.use_wiki2vec, 'train wiki2vec locally'
     self.prior_approx_mapping = u.get_prior_approx_mapping(
         self.entity_candidates_prior)
     self.page_content_lim = 5000
     if self.min_mentions > 1:
         query = 'select id from entities where num_mentions >= ' + str(
             self.min_mentions)
         cursor.execute(query)
         self.valid_entity_ids = set(row['id'] for row in cursor.fetchall())
     self.ablation = ablation
     self.entity_embeds = entity_embeds
     self._offset = 0
     with open('./entity_to_row_id.pkl', 'rb') as fh:
         entity_id_to_row = pickle.load(fh)
     self.token_ctr_by_entity_id = DocLookup('./desc_unstemmed_fs.npz',
                                             entity_id_to_row,
                                             default_value={1: 1},
                                             use_default=True)
     self.to_entity_id = read_cache(
         './page_to_entity_id.pkl',
         lambda: get_page_id_to_entity_id_lookup(cursor))
Example #23
def main():
    global model_to_save
    global experiment
    global rabbit
    rabbit = MyRabbit(args)
    if rabbit.model_params.dont_limit_num_uniq_tokens:
        raise NotImplementedError()
    if rabbit.model_params.frame_as_qa: raise NotImplementedError
    if rabbit.run_params.drop_val_loss_calc: raise NotImplementedError
    if rabbit.run_params.use_softrank_influence and not rabbit.run_params.freeze_all_but_last_for_influence:
        raise NotImplementedError
    if rabbit.train_params.weight_influence: raise NotImplementedError
    experiment = Experiment(rabbit.train_params + rabbit.model_params +
                            rabbit.run_params)
    print('Model name:', experiment.model_name)
    use_pretrained_doc_encoder = rabbit.model_params.use_pretrained_doc_encoder
    use_pointwise_loss = rabbit.train_params.use_pointwise_loss
    query_token_embed_len = rabbit.model_params.query_token_embed_len
    document_token_embed_len = rabbit.model_params.document_token_embed_len
    _names = []
    if not rabbit.model_params.dont_include_titles:
        _names.append('with_titles')
    if rabbit.train_params.num_doc_tokens_to_consider != -1:
        _names.append('num_doc_toks_' +
                      str(rabbit.train_params.num_doc_tokens_to_consider))
    if not rabbit.run_params.just_caches:
        if rabbit.model_params.dont_include_titles:
            document_lookup = read_cache(name('./doc_lookup.json', _names),
                                         get_robust_documents)
        else:
            document_lookup = read_cache(name('./doc_lookup.json', _names),
                                         get_robust_documents_with_titles)
    num_doc_tokens_to_consider = rabbit.train_params.num_doc_tokens_to_consider
    document_title_to_id = read_cache(
        './document_title_to_id.json',
        lambda: create_id_lookup(document_lookup.keys()))
    with open('./caches/106756_most_common_doc.json', 'r') as fh:
        doc_token_set = set(json.load(fh))
        tokenizer = Tokenizer()
        tokenized = set(
            sum(
                tokenizer.process_all(list(
                    get_robust_eval_queries().values())), []))
        doc_token_set = doc_token_set.union(tokenized)
    use_bow_model = not any([
        rabbit.model_params[attr] for attr in
        ['use_doc_out', 'use_cnn', 'use_lstm', 'use_pretrained_doc_encoder']
    ])
    use_bow_model = use_bow_model and not rabbit.model_params.dont_use_bow
    if use_bow_model:
        documents, document_token_lookup = read_cache(
            name(f'./docs_fs_tokens_limit_uniq_toks_qrels_and_106756.pkl',
                 _names),
            lambda: prepare_fs(document_lookup,
                               document_title_to_id,
                               num_tokens=num_doc_tokens_to_consider,
                               token_set=doc_token_set))
        if rabbit.model_params.keep_top_uniq_terms is not None:
            documents = [
                dict(
                    nlargest(rabbit.model_params.keep_top_uniq_terms,
                             _.to_pairs(doc), itemgetter(1)))
                for doc in documents
            ]
    else:
        documents, document_token_lookup = read_cache(
            name(
                f'./parsed_docs_{num_doc_tokens_to_consider}_tokens_limit_uniq_toks_qrels_and_106756.json',
                _names), lambda: prepare(document_lookup,
                                         document_title_to_id,
                                         num_tokens=num_doc_tokens_to_consider,
                                         token_set=doc_token_set))
    if not rabbit.run_params.just_caches:
        train_query_lookup = read_cache('./robust_train_queries.json',
                                        get_robust_train_queries)
        train_query_name_to_id = read_cache(
            './train_query_name_to_id.json',
            lambda: create_id_lookup(train_query_lookup.keys()))
    train_queries, query_token_lookup = read_cache(
        './parsed_robust_queries_dict.json',
        lambda: prepare(train_query_lookup,
                        train_query_name_to_id,
                        token_lookup=document_token_lookup,
                        token_set=doc_token_set,
                        drop_if_any_unk=True))
    query_tok_to_doc_tok = {
        idx: document_token_lookup.get(query_token)
        or document_token_lookup['<unk>']
        for query_token, idx in query_token_lookup.items()
    }
    names = [RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set]]
    if rabbit.train_params.use_pointwise_loss or not rabbit.run_params.just_caches:
        train_data = read_cache(
            name('./robust_train_query_results_tokens_qrels_and_106756.json',
                 names), lambda: read_query_result(
                     train_query_name_to_id,
                     document_title_to_id,
                     train_queries,
                     path='./indri/query_result' + RANKER_NAME_TO_SUFFIX[
                         rabbit.train_params.ranking_set]))
    else:
        train_data = []
    q_embed_len = rabbit.model_params.query_token_embed_len
    doc_embed_len = rabbit.model_params.document_token_embed_len
    if rabbit.model_params.append_difference or rabbit.model_params.append_hadamard:
        assert q_embed_len == doc_embed_len, 'Must use same size doc and query embeds when appending diff or hadamard'
    if q_embed_len == doc_embed_len:
        glove_lookup = get_glove_lookup(
            embedding_dim=q_embed_len,
            use_large_embed=rabbit.model_params.use_large_embed,
            use_word2vec=rabbit.model_params.use_word2vec)
        q_glove_lookup = glove_lookup
        doc_glove_lookup = glove_lookup
    else:
        q_glove_lookup = get_glove_lookup(
            embedding_dim=q_embed_len,
            use_large_embed=rabbit.model_params.use_large_embed,
            use_word2vec=rabbit.model_params.use_word2vec)
        doc_glove_lookup = get_glove_lookup(
            embedding_dim=doc_embed_len,
            use_large_embed=rabbit.model_params.use_large_embed,
            use_word2vec=rabbit.model_params.use_word2vec)
    num_query_tokens = len(query_token_lookup)
    num_doc_tokens = len(document_token_lookup)
    doc_encoder = None
    if use_pretrained_doc_encoder or rabbit.model_params.use_doc_out:
        doc_encoder, document_token_embeds = get_doc_encoder_and_embeddings(
            document_token_lookup, rabbit.model_params.only_use_last_out)
        if rabbit.model_params.use_glove:
            query_token_embeds_init = init_embedding(q_glove_lookup,
                                                     query_token_lookup,
                                                     num_query_tokens,
                                                     query_token_embed_len)
        else:
            query_token_embeds_init = from_doc_to_query_embeds(
                document_token_embeds, document_token_lookup,
                query_token_lookup)
        if not rabbit.train_params.dont_freeze_pretrained_doc_encoder:
            dont_update(doc_encoder)
        if rabbit.model_params.use_doc_out:
            doc_encoder = None
    else:
        document_token_embeds = init_embedding(doc_glove_lookup,
                                               document_token_lookup,
                                               num_doc_tokens,
                                               document_token_embed_len)
        if rabbit.model_params.use_single_word_embed_set:
            query_token_embeds_init = document_token_embeds
        else:
            query_token_embeds_init = init_embedding(q_glove_lookup,
                                                     query_token_lookup,
                                                     num_query_tokens,
                                                     query_token_embed_len)
    if not rabbit.train_params.dont_freeze_word_embeds:
        dont_update(document_token_embeds)
        dont_update(query_token_embeds_init)
    else:
        do_update(document_token_embeds)
        do_update(query_token_embeds_init)
    if rabbit.train_params.add_rel_score:
        query_token_embeds, additive = get_additive_regularized_embeds(
            query_token_embeds_init)
        rel_score = RelScore(query_token_embeds, document_token_embeds,
                             rabbit.model_params, rabbit.train_params)
    else:
        query_token_embeds = query_token_embeds_init
        additive = None
        rel_score = None
    eval_query_lookup = get_robust_eval_queries()
    eval_query_name_document_title_rels = get_robust_rels()
    test_query_names = []
    val_query_names = []
    for query_name in eval_query_lookup:
        if len(val_query_names) >= 50: test_query_names.append(query_name)
        else: val_query_names.append(query_name)
    test_query_name_document_title_rels = _.pick(
        eval_query_name_document_title_rels, test_query_names)
    test_query_lookup = _.pick(eval_query_lookup, test_query_names)
    test_query_name_to_id = create_id_lookup(test_query_lookup.keys())
    test_queries, __ = prepare(test_query_lookup,
                               test_query_name_to_id,
                               token_lookup=query_token_lookup)
    eval_ranking_candidates = read_query_test_rankings(
        './indri/query_result_test' +
        RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set])
    test_candidates_data = read_query_result(
        test_query_name_to_id,
        document_title_to_id,
        dict(zip(range(len(test_queries)), test_queries)),
        path='./indri/query_result_test' +
        RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set])
    test_ranking_candidates = process_raw_candidates(test_query_name_to_id,
                                                     test_queries,
                                                     document_title_to_id,
                                                     test_query_names,
                                                     eval_ranking_candidates)
    test_data = process_rels(test_query_name_document_title_rels,
                             document_title_to_id, test_query_name_to_id,
                             test_queries)
    val_query_name_document_title_rels = _.pick(
        eval_query_name_document_title_rels, val_query_names)
    val_query_lookup = _.pick(eval_query_lookup, val_query_names)
    val_query_name_to_id = create_id_lookup(val_query_lookup.keys())
    val_queries, __ = prepare(val_query_lookup,
                              val_query_name_to_id,
                              token_lookup=query_token_lookup)
    val_candidates_data = read_query_result(
        val_query_name_to_id,
        document_title_to_id,
        dict(zip(range(len(val_queries)), val_queries)),
        path='./indri/query_result_test' +
        RANKER_NAME_TO_SUFFIX[rabbit.train_params.ranking_set])
    val_ranking_candidates = process_raw_candidates(val_query_name_to_id,
                                                    val_queries,
                                                    document_title_to_id,
                                                    val_query_names,
                                                    eval_ranking_candidates)
    val_data = process_rels(val_query_name_document_title_rels,
                            document_title_to_id, val_query_name_to_id,
                            val_queries)
    train_normalized_score_lookup = read_cache(
        name('./train_normalized_score_lookup.pkl', names),
        lambda: get_normalized_score_lookup(train_data))
    test_normalized_score_lookup = get_normalized_score_lookup(
        test_candidates_data)
    val_normalized_score_lookup = get_normalized_score_lookup(
        val_candidates_data)
    if use_pointwise_loss:
        normalized_train_data = read_cache(
            name('./normalized_train_query_data_qrels_and_106756.json', names),
            lambda: normalize_scores_query_wise(train_data))
        collate_fn = lambda samples: collate_query_samples(
            samples,
            use_bow_model=use_bow_model,
            use_dense=rabbit.model_params.use_dense)
        train_dl = build_query_dataloader(
            documents,
            normalized_train_data,
            rabbit.train_params,
            rabbit.model_params,
            cache=name('train_ranking_qrels_and_106756.json', names),
            limit=10,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=train_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=False)
        test_dl = build_query_dataloader(
            documents,
            test_data,
            rabbit.train_params,
            rabbit.model_params,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=test_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=True)
        val_dl = build_query_dataloader(
            documents,
            val_data,
            rabbit.train_params,
            rabbit.model_params,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=val_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=True)
        model = PointwiseScorer(query_token_embeds, document_token_embeds,
                                doc_encoder, rabbit.model_params,
                                rabbit.train_params)
    else:
        if rabbit.train_params.use_noise_aware_loss:
            ranker_query_str_to_rankings = get_ranker_query_str_to_rankings(
                train_query_name_to_id,
                document_title_to_id,
                train_queries,
                limit=rabbit.train_params.num_snorkel_train_queries)
            query_names = reduce(
                lambda acc, query_to_ranking: acc.intersection(
                    set(query_to_ranking.keys()))
                if len(acc) != 0 else set(query_to_ranking.keys()),
                ranker_query_str_to_rankings.values(), set())
            all_ranked_lists_by_ranker = _.map_values(
                ranker_query_str_to_rankings, lambda query_to_ranking:
                [query_to_ranking[query] for query in query_names])
            ranker_query_str_to_pairwise_bins = get_ranker_query_str_to_pairwise_bins(
                train_query_name_to_id,
                document_title_to_id,
                train_queries,
                limit=rabbit.train_params.num_train_queries)
            snorkeller = Snorkeller(ranker_query_str_to_pairwise_bins)
            snorkeller.train(all_ranked_lists_by_ranker)
            calc_marginals = snorkeller.calc_marginals
        else:
            calc_marginals = None
        collate_fn = lambda samples: collate_query_pairwise_samples(
            samples,
            use_bow_model=use_bow_model,
            calc_marginals=calc_marginals,
            use_dense=rabbit.model_params.use_dense)
        if rabbit.run_params.load_influences:
            try:
                with open(rabbit.run_params.influences_path) as fh:
                    pairs_to_flip = defaultdict(set)
                    for pair, influence in json.load(fh):
                        if rabbit.train_params.use_pointwise_loss:
                            condition = True
                        else:
                            condition = influence < rabbit.train_params.influence_thresh
                        if condition:
                            query = tuple(pair[1])
                            pairs_to_flip[query].add(tuple(pair[0]))
            except FileNotFoundError:
                pairs_to_flip = None
        else:
            pairs_to_flip = None
        train_dl = build_query_pairwise_dataloader(
            documents,
            train_data,
            rabbit.train_params,
            rabbit.model_params,
            pairs_to_flip=pairs_to_flip,
            cache=name('train_ranking_qrels_and_106756.json', names),
            limit=10,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=train_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=False)
        test_dl = build_query_pairwise_dataloader(
            documents,
            test_data,
            rabbit.train_params,
            rabbit.model_params,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=test_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=True)
        val_dl = build_query_pairwise_dataloader(
            documents,
            val_data,
            rabbit.train_params,
            rabbit.model_params,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=val_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=True)
        val_rel_dl = build_query_pairwise_dataloader(
            documents,
            val_data,
            rabbit.train_params,
            rabbit.model_params,
            query_tok_to_doc_tok=query_tok_to_doc_tok,
            normalized_score_lookup=val_normalized_score_lookup,
            use_bow_model=use_bow_model,
            collate_fn=collate_fn,
            is_test=True,
            rel_vs_irrel=True,
            candidates=val_ranking_candidates,
            num_to_rank=rabbit.run_params.num_to_rank)
        model = PairwiseScorer(query_token_embeds,
                               document_token_embeds,
                               doc_encoder,
                               rabbit.model_params,
                               rabbit.train_params,
                               use_bow_model=use_bow_model)
    train_ranking_dataset = RankingDataset(
        documents,
        train_dl.dataset.rankings,
        rabbit.train_params,
        rabbit.model_params,
        rabbit.run_params,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        normalized_score_lookup=train_normalized_score_lookup,
        use_bow_model=use_bow_model,
        use_dense=rabbit.model_params.use_dense)
    test_ranking_dataset = RankingDataset(
        documents,
        test_ranking_candidates,
        rabbit.train_params,
        rabbit.model_params,
        rabbit.run_params,
        relevant=test_dl.dataset.rankings,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        cheat=rabbit.run_params.cheat,
        normalized_score_lookup=test_normalized_score_lookup,
        use_bow_model=use_bow_model,
        use_dense=rabbit.model_params.use_dense)
    val_ranking_dataset = RankingDataset(
        documents,
        val_ranking_candidates,
        rabbit.train_params,
        rabbit.model_params,
        rabbit.run_params,
        relevant=val_dl.dataset.rankings,
        query_tok_to_doc_tok=query_tok_to_doc_tok,
        cheat=rabbit.run_params.cheat,
        normalized_score_lookup=val_normalized_score_lookup,
        use_bow_model=use_bow_model,
        use_dense=rabbit.model_params.use_dense)
    if rabbit.train_params.memorize_test:
        train_dl = test_dl
        train_ranking_dataset = test_ranking_dataset
    model_data = DataBunch(train_dl,
                           val_rel_dl,
                           test_dl,
                           collate_fn=collate_fn,
                           device=torch.device('cuda') if
                           torch.cuda.is_available() else torch.device('cpu'))
    multi_objective_model = MultiObjective(model, rabbit.train_params,
                                           rel_score, additive)
    model_to_save = multi_objective_model
    if rabbit.train_params.memorize_test:
        try:
            del train_data
        except NameError:
            pass
    if not rabbit.run_params.just_caches:
        del document_lookup
        del train_query_lookup
    del query_token_lookup
    del document_token_lookup
    del train_queries
    try:
        del glove_lookup
    except UnboundLocalError:
        del q_glove_lookup
        del doc_glove_lookup
    if rabbit.run_params.load_model:
        try:
            multi_objective_model.load_state_dict(
                torch.load(rabbit.run_params.load_path))
        except RuntimeError:
            dp = nn.DataParallel(multi_objective_model)
            dp.load_state_dict(torch.load(rabbit.run_params.load_path))
            multi_objective_model = dp.module
    else:
        train_model(multi_objective_model, model_data, train_ranking_dataset,
                    val_ranking_dataset, test_ranking_dataset,
                    rabbit.train_params, rabbit.model_params,
                    rabbit.run_params, experiment)
    if rabbit.train_params.fine_tune_on_val:
        fine_tune_model_data = DataBunch(
            val_rel_dl,
            val_rel_dl,
            test_dl,
            collate_fn=collate_fn,
            device=torch.device('cuda')
            if torch.cuda.is_available() else torch.device('cpu'))
        train_model(multi_objective_model,
                    fine_tune_model_data,
                    val_ranking_dataset,
                    val_ranking_dataset,
                    test_ranking_dataset,
                    rabbit.train_params,
                    rabbit.model_params,
                    rabbit.run_params,
                    experiment,
                    load_path=rabbit.run_params.load_path)
    multi_objective_model.eval()
    device = model_data.device
    gpu_multi_objective_model = multi_objective_model.to(device)
    if rabbit.run_params.calc_influence:
        if rabbit.run_params.freeze_all_but_last_for_influence:
            last_layer_idx = _.find_last_index(
                multi_objective_model.model.pointwise_scorer.layers,
                lambda layer: isinstance(layer, nn.Linear))
            to_last_layer = lambda x: gpu_multi_objective_model(
                *x, to_idx=last_layer_idx)
            last_layer = gpu_multi_objective_model.model.pointwise_scorer.layers[
                last_layer_idx]
            diff_wrt = [p for p in last_layer.parameters() if p.requires_grad]
        else:
            diff_wrt = None
        test_hvps = calc_test_hvps(
            multi_objective_model.loss,
            gpu_multi_objective_model,
            DeviceDataLoader(train_dl, device, collate_fn=collate_fn),
            val_rel_dl,
            rabbit.run_params,
            diff_wrt=diff_wrt,
            show_progress=True,
            use_softrank_influence=rabbit.run_params.use_softrank_influence)
        influences = []
        if rabbit.train_params.use_pointwise_loss:
            num_real_samples = len(train_dl.dataset)
        else:
            num_real_samples = train_dl.dataset._num_pos_pairs
        if rabbit.run_params.freeze_all_but_last_for_influence:
            _sampler = SequentialSamplerWithLimit(train_dl.dataset,
                                                  num_real_samples)
            _batch_sampler = BatchSampler(_sampler,
                                          rabbit.train_params.batch_size,
                                          False)
            _dl = DataLoader(train_dl.dataset,
                             batch_sampler=_batch_sampler,
                             collate_fn=collate_fn)
            sequential_train_dl = DeviceDataLoader(_dl,
                                                   device,
                                                   collate_fn=collate_fn)
            influences = calc_dataset_influence(gpu_multi_objective_model,
                                                to_last_layer,
                                                sequential_train_dl,
                                                test_hvps,
                                                sum_p=True).tolist()
        else:
            for i in progressbar(range(num_real_samples)):
                train_sample = train_dl.dataset[i]
                x, labels = to_device(collate_fn([train_sample]), device)
                device_train_sample = (x, labels.squeeze())
                influences.append(
                    calc_influence(multi_objective_model.loss,
                                   gpu_multi_objective_model,
                                   device_train_sample,
                                   test_hvps,
                                   diff_wrt=diff_wrt).sum().tolist())
        with open(rabbit.run_params.influences_path, 'w+') as fh:
            json.dump([[train_dl.dataset[idx][1], influence]
                       for idx, influence in enumerate(influences)], fh)
Example #24
    print("saving...")
    with open('plaats.json', 'w') as outfile:
        json.dump(geoJson, outfile)
    print("saved to file!")


while offsetCount < maxOffsetCount:
    url = "https://data.pdok.nl/sparql?query=" + quote(query)
    try:
        with urlopen(Request(url, headers={'Accept':
                                           'application/json'})) as response:
            data = json.loads(response.read().decode())
            print(data)
            if len(data['results']) > 0:
                for value in data['results']['bindings']:
                    value = pydash.map_values(value, lambda x: x['value'])
                    g1 = shapely.wkt.loads(value['geometry'])
                    g2 = geojson.Feature(geometry=g1, properties=value)
                    del g2['properties']['geometry']
                    geoJson['features'].append(g2)
                print(
                    "incrementing offset to " + str(offset) + " ....  " +
                    "current length of features ", len(geoJson['features']))
                offset += limit
                offsetCount += 1

            else:
                # Break when we run out of results; with many results this
                # loop can take a long time to finish.
                break
    except Exception as inst:
        saveToFile(geoJson)
Example #25
    def update_parser_for_functions(self, modul_name, parsers, class_ref):

        parent_path_parser = parsers[modul_name]

        for function_name, function_ref in inspect.getmembers(class_ref):
            if inspect.isfunction(function_ref):
                groups = {}
                if not "callables" in parent_path_parser:
                    parent_path_parser['callables'] = {}
                if not "subparsers" in parent_path_parser:
                    parent_path_parser['subparsers'] = parent_path_parser[
                        'parser'].add_subparsers(
                            help=self.get_config_value(modul_name, 'sub_help'))
                if hasattr(function_ref, '_action_param_action'):
                    if not 'action_nargs' in parent_path_parser:
                        parent_path_parser['action_nargs'] = {}

                    parent_path_parser['action_nargs'][
                        function_ref.
                        __name__] = function_ref._action_param_nargs
                    parent_path_parser['parser'].add_argument(
                        f'--{function_name}',
                        action=function_ref._action_param_action(
                            *function_ref._action_param_args,
                            **function_ref._action_param_kwargs),
                        nargs=function_ref._action_param_nargs)
                    continue
                if "subparsers" in parent_path_parser and hasattr(
                        parent_path_parser['subparsers'],
                        'choices') and function_name in parent_path_parser[
                            'subparsers'].choices:
                    parser = parent_path_parser['subparsers'].choices[
                        function_name]
                else:
                    parser = parent_path_parser['subparsers'].add_parser(
                        function_name,
                        description=self.get_config_value(
                            f'{modul_name}.{function_name}', 'description'))
                if hasattr(function_ref, '_arg_groups'):
                    groups = map_values(
                        group_by(function_ref._arg_groups, 'name'),
                        lambda groupArray: self.add_group_to_parrser(
                            parser, groupArray[-1]))
                if function_name in parent_path_parser['callables']:
                    sys.stderr.write(
                        f'{function_name} in {parent_path_parser["callables"]["class_ref"].__name__} is being overwritten by {function_name} in {class_ref.__name__}'
                    )

                parent_path_parser['callables'][function_name] = {
                    "parser": parser,
                    "class_ref": class_ref,
                    "function_name": function_name,
                    "groups": groups
                }

                for param in inspect.signature(
                        function_ref).parameters.values():

                    parser = parent_path_parser['callables'][function_name][
                        'parser']
                    if param.annotation.__class__ == ParserArgType:

                        args = tuple([f'--{param.name}']) if len(
                            param.annotation.args
                        ) == 0 else param.annotation.args
                        if len(param.annotation.args
                               ) > 0 and 'dest' not in param.annotation.kwargs:
                            param.annotation.kwargs['dest'] = param.name
                        if hasattr(param.annotation, 'group'):
                            group = get(groups, param.annotation.group)
                            if group:
                                group.add_argument(*args,
                                                   **param.annotation.kwargs)
                            else:
                                raise Exception(
                                    f'it appears that the group "{param.annotation.group}" is referenced by an argument but was not found when building the parser; existing groups are {json.dumps(list(groups.keys()))}'
                                )
                        else:
                            parser.add_argument(*args,
                                                **param.annotation.kwargs)
                    if param.annotation == int:
                        parser.add_argument(f'--{param.name}',
                                            type=param.annotation)
                    if param.annotation == str:
                        parser.add_argument(f'--{param.name}',
                                            type=param.annotation)
Example #26
def test_map_values(case, expected):
    assert _.map_values(*case) == expected
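
The test receives parametrized (case, expected) pairs and splats each case into map_values. Illustrative fixtures (these particular tuples are assumptions, not the repo's fixture list) would look like:

import pydash as _

cases = [
    (({'a': 1, 'b': 2}, lambda v: v * 2), {'a': 2, 'b': 4}),
    (({'a': 1}, lambda v, k: k), {'a': 'a'}),
]
for case, expected in cases:
    assert _.map_values(*case) == expected
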
Example #27
def get_raw_datasets(cursors, num_items):
  return _.map_values(cursors, _get_data_fetcher(num_items))
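
map_values also composes with a callable produced by a factory such as _get_data_fetcher: the returned fetcher is applied to every cursor value. A standalone sketch of the same shape (the fetcher below is a stand-in for the repo's helper):

import pydash as _

def _get_data_fetcher(num_items):
    return lambda cursor: cursor[:num_items]

cursors = {'train': [1, 2, 3, 4], 'val': [5, 6]}
_.map_values(cursors, _get_data_fetcher(2))
# -> {'train': [1, 2], 'val': [5, 6]}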