def test_mention_context_batch_sampler_many_last_ids():
    mentions_by_page = {
        0: [120],
        1: [130],
        2: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
    }
    cursor = get_mock_cursor(mentions_by_page)
    batch_size = 5
    page_id_order = [2, 0, 1]
    mentions_in_page_order = _.mapcat(
        page_id_order, lambda page_id: mentions_by_page[page_id])
    batch_sampler = MentionContextBatchSampler(cursor, page_id_order,
                                               batch_size)
    batches_seen = []
    indexes_seen = []
    for batch_num, batch_indexes in enumerate(batch_sampler):
        assert _.is_empty(_.intersection(batch_indexes, indexes_seen))
        indexes_seen.extend(batch_indexes)
        if batch_num == 0:
            assert _.is_empty(_.difference(batch_indexes, mentions_by_page[2]))
            assert len(batch_indexes) == batch_size
        elif batch_num == 1:
            assert _.is_empty(_.difference(batch_indexes, mentions_by_page[2]))
            assert len(batch_indexes) == batch_size
        elif batch_num == 2:
            assert len(_.intersection(batch_indexes, mentions_by_page[2])) == 2
            assert len(_.intersection(batch_indexes, mentions_by_page[0])) == 1
            assert len(_.intersection(batch_indexes, mentions_by_page[1])) == 1
            assert len(batch_indexes) == 4
        batches_seen.append(batch_num)
    assert _.is_empty(_.difference(mentions_in_page_order, indexes_seen))
    assert batches_seen == [0, 1, 2]
Example #2
def jaccard_similarity(f1, f2):
    # Jaccard similarity: size of the intersection over size of the union,
    # guarding against an empty union.
    div_by = len(union(f1, f2))
    if div_by == 0:
        return 0
    return len(intersection(f1, f2)) / div_by
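`union` and `intersection` are presumably imported at module level from pydash; a small worked check under that assumption:

from pydash import union, intersection  # assumed module-level imports

# one shared feature out of four distinct features -> 1 / 4
print(jaccard_similarity(['alcohol', 'smoking_area', 'price'],
                         ['price', 'franchise']))  # 0.25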
Example #3
def print_intersection_detail(title, f1, f2):
    # Print both feature lists and the numbers in their intersection, under a padded title.
    print('\n')
    print(pad(title, 50, '-'))
    print('\n', f1, '\n\n', f2, '\n')
    print(intersection(f1, f2))
    print('\n')
def test_mention_context_batch_sampler():
    mentions_by_page = {
        0: [40, 50, 60, 70, 80, 90],
        1: [100],
        2: [0, 10, 20, 30]
    }
    cursor = get_mock_cursor(mentions_by_page)
    batch_size = 5
    page_id_order = [2, 0, 1]
    mentions_in_page_order = _.mapcat(
        page_id_order, lambda page_id: mentions_by_page[page_id])
    batch_sampler = MentionContextBatchSampler(cursor, page_id_order,
                                               batch_size)
    batches_seen = []
    indexes_seen = []
    for batch_num, batch_indexes in enumerate(batch_sampler):
        assert _.is_empty(_.intersection(batch_indexes, indexes_seen))
        indexes_seen.extend(batch_indexes)
        if batch_num == 0:
            assert len(set(batch_indexes) - {0, 10, 20, 30}) == 1
            assert any([
                mention in set(batch_indexes) - {0, 10, 20, 30}
                for mention in mentions_by_page[0]
            ])
            assert len(batch_indexes) == batch_size
        elif batch_num == 1:
            assert _.is_empty(set(batch_indexes) - set(mentions_by_page[0]))
            assert len(batch_indexes) == batch_size
        elif batch_num == 2:
            assert batch_indexes == [100]
            assert len(batch_indexes) == 1
        batches_seen.append(batch_num)
    assert _.is_empty(_.difference(mentions_in_page_order, indexes_seen))
    assert batches_seen == [0, 1, 2]
Example #5
def overlap_similarity(f1, f2):
    # Overlap coefficient: size of the intersection over the size of the smaller set,
    # guarding against empty inputs.
    div_by = min_([len(f1), len(f2)])
    if div_by == 0:
        return 0
    return len(intersection(f1, f2)) / div_by
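As with the Jaccard helper, `intersection` and `min_` are presumably pydash imports; a quick check, including the empty-input guard:

from pydash import intersection, min_  # assumed module-level imports

print(overlap_similarity(['a', 'b', 'c'], ['b', 'c', 'd']))  # 2 / min(3, 3) = 0.666...
print(overlap_similarity([], ['a']))                         # 0: the guard short-circuits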
Example #6
def calculate_ship_extension(ship: Ship, candidates: List[Coord]) -> List[Coord]:
    if len(ship) == 1:
        return find_adjacent_cells(ship.cells[0], candidates, only_orthogonal=True)
    increment = ship.cells[1] - ship.cells[0]
    first, last = ship.cells[0], ship.cells[-1]
    if increment.j == 0:  # vertical
        ext = [first + Coord((-1, 0)), last + Coord((1, 0))]
    else:  # horizontal
        ext = [first + Coord((0, -1)), last + Coord((0, 1))]

    return py_.intersection(ext, candidates)
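`Ship`, `Coord`, and `find_adjacent_cells` come from the surrounding game code; the sketch below redoes only the final step with bare (row, col) tuples standing in for `Coord`, just to show how `py_.intersection` keeps the extension cells that are still valid candidates:

import pydash as py_

cells = [(2, 3), (2, 4), (2, 5)]            # hypothetical horizontal ship
candidates = [(2, 2), (2, 6), (3, 3)]       # cells still worth shooting at
first, last = cells[0], cells[-1]
ext = [(first[0], first[1] - 1), (last[0], last[1] + 1)]  # one cell past each end
print(py_.intersection(ext, candidates))    # [(2, 2), (2, 6)]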
Example #7
from pydash import uniq_by, mapcat, intersection
import sys
import requests

# Download the two partner lists named on the command line, dedupe each by
# company domain, and report how many domains the two lists share.
wsurl = "https://s3.amazonaws.com/challenge.getcrossbeam.com/public/"
rs1 = requests.get(wsurl + sys.argv[1] + ".json")
data1 = rs1.json()
names1 = uniq_by(data1['companies'], lambda r: r['domain'])
n1Ar = mapcat(names1, lambda n: n['domain'])
rs2 = requests.get(wsurl + sys.argv[2] + ".json")
data2 = rs2.json()
names2 = uniq_by(data2['companies'], lambda r: r['domain'])
n2Ar = mapcat(names2, lambda n: n['domain'])
names3 = intersection(n1Ar, n2Ar)
print(repr(len(names1)) + " " + repr(len(names2)) + " " + repr(len(names3)))
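The field accesses above imply each downloaded file is a JSON object with a `companies` list of records carrying a `domain` key; a self-contained run of the same pipeline on made-up in-memory data:

from pydash import uniq_by, mapcat, intersection

data1 = {'companies': [{'domain': 'a.com'}, {'domain': 'b.com'}, {'domain': 'a.com'}]}
data2 = {'companies': [{'domain': 'b.com'}, {'domain': 'c.com'}]}
names1 = uniq_by(data1['companies'], lambda r: r['domain'])
names2 = uniq_by(data2['companies'], lambda r: r['domain'])
shared = intersection(mapcat(names1, lambda n: n['domain']),
                      mapcat(names2, lambda n: n['domain']))
print(len(names1), len(names2), len(shared))  # 2 2 1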
Example #8
def test_mention_context_dataset():
  cursor = get_mock_cursor()
  page_id_order = [3, 1, 2]
  batch_size = 5
  entity_candidates_prior = {'aa': {1: 20},
                             'bb': {0: 10, 1: 2},
                             'cc': {2: 3}}
  entity_label_lookup = dict(zip(range(5), range(5)))
  embedding_dim = 1
  embedding_dict = {'<PAD>': torch.tensor([0]),
                    '<UNK>': torch.tensor([2]),
                    '<MENTION_START_HERE>': torch.tensor([-1]),
                    '<MENTION_END_HERE>': torch.tensor([-2])}
  token_idx_lookup = dict(zip(embedding_dict.keys(),
                              range(len(embedding_dict))))
  embedding = nn.Embedding.from_pretrained(torch.stack([embedding_dict[token] for token in token_idx_lookup]))
  num_entities = 5
  num_candidates = 2
  dataset = MentionContextDataset(cursor,
                                  page_id_order,
                                  entity_candidates_prior,
                                  entity_label_lookup,
                                  embedding,
                                  token_idx_lookup,
                                  batch_size,
                                  num_entities,
                                  num_candidates)
  dataset._mention_infos = {0: {'mention': 'bb', 'offset': 9, 'page_id': 2, 'entity_id': 0, 'mention_id': 0},
                            1: {'mention': 'aa', 'offset': 6, 'page_id': 2, 'entity_id': 1, 'mention_id': 1},
                            2: {'mention': 'cc', 'offset': 0, 'page_id': 1, 'entity_id': 2, 'mention_id': 2},
                            3: {'mention': 'bb', 'offset': 3, 'page_id': 1, 'entity_id': 0, 'mention_id': 3},
                            4: {'mention': 'bb', 'offset': 3, 'page_id': 0, 'entity_id': 1, 'mention_id': 4}}
  dataset._page_content_lookup = {2: 'a b c aa bb',
                                  1: 'cc bb c b a',
                                  0: 'dd bb a b c'}
  dataset._sentence_spans_lookup = {2: [(0, 5), (6, 11)],
                                    1: [(0, 5), (6, 11)],
                                    0: [(0, 5), (6, 11)]}
  dataset._embedded_page_content_lookup = {2: [0, 1],
                                           1: [1, 2],
                                           0: [1]}
  dataset._entity_page_mentions_lookup = {2: [[0]],
                                          1: [[1]],
                                          0: [[1]]}
  dataset._mentions_per_page_ctr = {2: 2,
                                    1: 2,
                                    0: 1}
  expected_data = [{'sentence_splits': [['aa', 'bb'], ['bb']],
                    'label': 0,
                    'embedded_page_content': [0, 1],
                    'entity_page_mentions': [[0]],
                    'candidate_ids': torch.tensor([0, 1]),
                    'p_prior': torch.tensor([10/12, 2/12])},
                   {'sentence_splits': [['aa'], ['aa', 'bb']],
                    'label': 1,
                    'embedded_page_content': [0, 1],
                    'entity_page_mentions': [[0]],
                    'candidate_ids': torch.tensor([1]),
                    'p_prior': torch.tensor([1.0])},
                   {'sentence_splits': [['cc'], ['cc', 'bb']],
                    'label': 2,
                    'embedded_page_content': [1, 2],
                    'entity_page_mentions': [[1]],
                    'candidate_ids': torch.tensor([2]),
                    'p_prior': torch.tensor([1.0])},
                   {'sentence_splits': [['cc', 'bb'], ['bb']],
                    'label': 0,
                    'embedded_page_content': [1, 2],
                    'entity_page_mentions': [[1]],
                    'candidate_ids': torch.tensor([0, 1]),
                    'p_prior': torch.tensor([10/12, 2/12])},
                   {'sentence_splits': [['dd', 'bb'], ['bb']],
                    'label': 1,
                    'embedded_page_content': [1],
                    'entity_page_mentions': [[1]],
                    'candidate_ids': torch.tensor([0, 1]),
                    'p_prior': torch.tensor([10/12, 2/12])}]
  iterator = iter(dataset)
  dataset_values = [next(iterator) for _ in range(len(expected_data))]
  comparison = {'sentence_splits': _.is_equal,
                'label': _.is_equal,
                'embedded_page_content': _.is_equal,
                'entity_page_mentions': _.is_equal,
                'candidate_ids': compare_candidate_ids_tensor,
                'p_prior': lambda a, b: len(a) == len(_.intersection(a.tolist(), b.tolist()))}
  assert coll_compare_keys_by(expected_data, dataset_values, comparison)
Example #9
def test_intersection(case, expected):
    assert _.intersection(*case) == expected
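The test takes `case` and `expected` as arguments, so it is presumably driven by a `pytest.mark.parametrize` table; a minimal sketch with illustrative cases (not pydash's own fixtures):

import pytest
import pydash as _

@pytest.mark.parametrize('case,expected', [
    (([1, 2, 3], [2, 3, 4]), [2, 3]),   # common values, order of the first list kept
    (([1, 2], [3, 4]), []),             # disjoint lists intersect to an empty list
])
def test_intersection(case, expected):
    assert _.intersection(*case) == expected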
Example #10
import pydash

# `lists` maps combination labels (BG/BL = benign gain/loss, PG/PL = pathogenic
# gain/loss) to gene lists; the keys before 'BGBLPGPL' are elided in this excerpt.
lists = {
  'BGBLPGPL': []
}

with open('files/gene lists/dbVarCNGsBenignPCGenes.txt', 'r') as dbVarCNGsBenignPCGenes:
    benignGain = dbVarCNGsBenignPCGenes.read().splitlines()

with open('files/gene lists/dbVarCNLsBenignPCGenes.txt', 'r') as dbVarCNLsBenignPCGenes:
    benignLoss = dbVarCNLsBenignPCGenes.read().splitlines()

with open('files/gene lists/dbVarCNGPeaksPathogenicPCGenes.txt', 'r') as dbVarCNGPeaksPathogenicPCGenes:
    pathoGain = dbVarCNGPeaksPathogenicPCGenes.read().splitlines()

with open('files/gene lists/dbVarCNLPeaksPathogenicPCGenes.txt', 'r') as dbVarCNLPeaksPathogenicPCGenes:
    pathoLoss = dbVarCNLPeaksPathogenicPCGenes.read().splitlines()

lists['BGBLPGPL'] = pydash.intersection(benignGain, benignLoss, pathoGain, pathoLoss)

_BGBLPG = pydash.intersection(benignGain, benignLoss, pathoGain)
lists['BGBLPG'] = pydash.difference(_BGBLPG, pathoLoss)

_BGBLPL = pydash.intersection(benignGain, benignLoss, pathoLoss)
lists['BGBLPL'] = pydash.difference(_BGBLPL, pathoGain)

_BGPGPL = pydash.intersection(benignGain, pathoGain, pathoLoss)
lists['BGPGPL'] = pydash.difference(_BGPGPL, benignLoss)

_BLPGPL = pydash.intersection(benignLoss, pathoGain, pathoLoss)
lists['BLPGPL'] = pydash.difference(_BLPGPL, benignGain)

_BGBL = pydash.intersection(benignGain, benignLoss)
lists['BGBL'] = pydash.difference(_BGBL, pathoGain, pathoLoss)
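Each entry follows the same pattern: intersect the lists a gene must appear in, then subtract the lists it must not appear in; a tiny illustration with made-up data:

import pydash

benign_gain = ['BRCA1', 'TP53', 'EGFR']   # made-up example lists
benign_loss = ['TP53', 'EGFR']
patho_gain = ['EGFR', 'KRAS']
patho_loss = ['KRAS']

# genes in benignGain, benignLoss and pathoGain but absent from pathoLoss
bgblpg = pydash.difference(
    pydash.intersection(benign_gain, benign_loss, patho_gain), patho_loss)
print(bgblpg)  # ['EGFR']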
Example #11
def main():
    printer = pprint.PrettyPrinter(indent=2)
    restaurant_columns = [
        'alcohol', 'smoking_area', 'dress_code', 'accessibility', 'price',
        'Rambience', 'franchise'
    ]
    restaurant_cat_columns = [
        'alcohol', 'smoking_area', 'dress_code', 'accessibility', 'price',
        'Rambience', 'franchise'
    ]
    user_columns = [
        'smoker', 'drink_level', 'dress_preference', 'ambience', 'transport',
        'marital_status', 'hijos', 'birth_year', 'interest', 'personality',
        'religion', 'activity', 'weight', 'budget', 'height'
    ]
    user_cat_columns = [
        'smoker', 'drink_level', 'dress_preference', 'ambience', 'transport',
        'marital_status', 'hijos', 'interest', 'personality', 'religion',
        'activity', 'budget'
    ]
    features = user_columns + restaurant_columns
    categorical_features = user_cat_columns + restaurant_cat_columns
    categorical_feature_indices = [
        features.index(feature_name) for feature_name in categorical_features
    ]
    restaurant_info = pd.read_csv('./data/geoplaces2.csv',
                                  dtype=dict(
                                      zip(restaurant_cat_columns, [str] *
                                          len(restaurant_cat_columns))))
    user_info = pd.read_csv('./data/userprofile.csv',
                            dtype=dict(
                                zip(user_cat_columns,
                                    [str] * len(user_cat_columns))))
    cuisine_type = pd.read_csv('./data/chefmozcuisine.csv')
    user_rating = pd.read_csv('./data/rating_final.csv')
    user_preferences = pd.read_csv('./data/usercuisine.csv')
    place_ids = _.intersection(user_rating['placeID'].tolist(),
                               restaurant_info['placeID'].tolist())
    user_ids = _.intersection(user_rating['userID'].tolist(),
                              user_info['userID'].tolist())
    joined_data_dict = {column: [] for column in features + ['rating']}
    for row in user_rating[['userID', 'placeID', 'rating']].iterrows():
        user_id = row[1]['userID']
        place_id = row[1]['placeID']
        if user_id in user_ids and place_id in place_ids:
            joined_data_dict['rating'].append(row[1]['rating'])
            for column in user_columns:
                joined_data_dict[column].append(
                    user_info[user_info['userID'] == user_id][column].iloc[0])
            for column in restaurant_columns:
                joined_data_dict[column].append(restaurant_info[
                    restaurant_info['placeID'] == place_id][column].iloc[0])
    joined_data = pd.DataFrame(joined_data_dict)
    data = joined_data.loc[:, joined_data.columns != 'rating']
    rating = joined_data['rating']
    encoded_data, categorical_names = encode_data(data, features,
                                                  categorical_features)
    X = encoded_data
    y = rating.values == 2
    one_hot_encoder = OneHotEncoder(
        categorical_features=categorical_feature_indices)
    one_hot_encoder.fit(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    model = ensemble.RandomForestClassifier(class_weight='balanced')
    model.fit(one_hot_encoder.transform(X_train), y_train)
    predict_fn = lambda X: model.predict(one_hot_encoder.transform(X))
    predict_proba_fn = lambda X: model.predict_proba(
        one_hot_encoder.transform(X))
    report(predict_fn, X_test, y_test)
    explainer = train({
        'X_train': X_train.values,
        'y_train': y_train
    }, {
        'feature_names': features,
        'categorical_feature_indices': categorical_feature_indices,
        'categorical_names': categorical_names
    })
    explanation = inquire(
        explainer, {
            'instance': X_test.iloc[0],
            'predict_fn': predict_proba_fn,
            'num_features': 5
        })
    printer.pprint(explanation.as_list())
Example #12
def len_intersection(f1, f2):
    return len(intersection(f1, f2))
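An illustrative call, again assuming `intersection` is pydash's as in the snippets above:

from pydash import intersection

print(len_intersection([1, 2, 3], [2, 3, 4]))  # 2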