def test_mention_context_batch_sampler_many_last_ids():
    """Check batching when the first page in the order holds most mentions.

    Page 2 is first in ``page_id_order`` and has 12 mentions, so with a batch
    size of 5 the first two batches must come entirely from page 2, and the
    last batch holds the 2 remaining page-2 mentions plus the single mention
    of page 0 and of page 1.
    """
    batch_size = 5
    page_id_order = [2, 0, 1]
    mentions_by_page = {
        0: [120],
        1: [130],
        2: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110],
    }
    cursor = get_mock_cursor(mentions_by_page)
    mentions_in_page_order = _.mapcat(
        page_id_order, lambda page_id: mentions_by_page[page_id])
    batch_sampler = MentionContextBatchSampler(cursor, page_id_order,
                                               batch_size)
    # (page id, number of its mentions expected in the final batch)
    last_batch_counts = ((2, 2), (0, 1), (1, 1))
    batches_seen = []
    indexes_seen = []
    for batch_num, batch_indexes in enumerate(batch_sampler):
        # A mention index must never be yielded twice.
        assert _.is_empty(_.intersection(batch_indexes, indexes_seen))
        indexes_seen.extend(batch_indexes)
        if batch_num in (0, 1):
            # Full batches drawn only from page 2.
            assert _.is_empty(
                _.difference(batch_indexes, mentions_by_page[2]))
            assert len(batch_indexes) == batch_size
        elif batch_num == 2:
            for page_id, expected_count in last_batch_counts:
                overlap = _.intersection(batch_indexes,
                                         mentions_by_page[page_id])
                assert len(overlap) == expected_count
            assert len(batch_indexes) == 4
        batches_seen.append(batch_num)
    # Every mention was produced, in exactly three batches.
    assert _.is_empty(_.difference(mentions_in_page_order, indexes_seen))
    assert batches_seen == [0, 1, 2]
def jaccard_similarity(f1, f2):
    """Return len(intersection) / len(union) of ``f1`` and ``f2``.

    Returns 0 when the union of the two collections is empty, which avoids
    dividing by zero.
    """
    union_size = len(union(f1, f2))
    if union_size != 0:
        return len(intersection(f1, f2)) / union_size
    return 0
def print_intersection_detail(title, f1, f2):
    """Print a title banner, both inputs, and their intersection to stdout.

    The banner is ``pad(title, 50, '-')``. Nothing is returned.
    """
    newline = '\n'
    print(newline)
    banner = pad(title, 50, '-')
    print(banner)
    print(newline, f1, '\n\n', f2, newline)
    common = intersection(f1, f2)
    print(common)
    print(newline)
def test_mention_context_batch_sampler():
    """Check batching across page boundaries for a small set of pages.

    With pages visited in the order 2, 0, 1 and a batch size of 5, the first
    batch must hold the four page-2 mentions plus one page-0 mention, the
    second batch must hold only page-0 mentions, and the last batch is the
    single page-1 mention.
    """
    batch_size = 5
    page_id_order = [2, 0, 1]
    mentions_by_page = {
        0: [40, 50, 60, 70, 80, 90],
        1: [100],
        2: [0, 10, 20, 30],
    }
    cursor = get_mock_cursor(mentions_by_page)
    mentions_in_page_order = _.mapcat(
        page_id_order, lambda page_id: mentions_by_page[page_id])
    batch_sampler = MentionContextBatchSampler(cursor, page_id_order,
                                               batch_size)
    batches_seen = []
    indexes_seen = []
    for batch_num, batch_indexes in enumerate(batch_sampler):
        # A mention index must never be yielded twice.
        assert _.is_empty(_.intersection(batch_indexes, indexes_seen))
        indexes_seen.extend(batch_indexes)
        if batch_num == 0:
            # Exactly one index is not from page 2, and it is from page 0.
            outside_page_2 = set(batch_indexes) - {0, 10, 20, 30}
            assert len(outside_page_2) == 1
            assert any(mention in outside_page_2
                       for mention in mentions_by_page[0])
            assert len(batch_indexes) == batch_size
        elif batch_num == 1:
            outside_page_0 = set(batch_indexes) - set(mentions_by_page[0])
            assert _.is_empty(outside_page_0)
            assert len(batch_indexes) == batch_size
        elif batch_num == 2:
            assert batch_indexes == [100]
            assert len(batch_indexes) == 1
        batches_seen.append(batch_num)
    # Every mention was produced, in exactly three batches.
    assert _.is_empty(_.difference(mentions_in_page_order, indexes_seen))
    assert batches_seen == [0, 1, 2]
def overlap_similarity(f1, f2):
    """Return len(intersection) divided by the length of the shorter input.

    Returns 0 when either ``f1`` or ``f2`` is empty, which avoids dividing by
    zero.
    """
    smaller_size = min_([len(f1), len(f2)])
    if smaller_size != 0:
        return len(intersection(f1, f2)) / smaller_size
    return 0
def calculate_ship_extension(ship: Ship, candidates: List[Coord]) -> List[Coord]:
    """Return the candidate cells that would extend ``ship`` by one cell.

    A one-cell ship may grow into any orthogonally adjacent candidate. A
    longer ship may only grow along its own axis: the cell just before its
    first cell or just after its last cell, provided that cell is among
    ``candidates``.
    """
    cells = ship.cells
    if len(ship) == 1:
        return find_adjacent_cells(cells[0], candidates, only_orthogonal=True)

    increment = cells[1] - cells[0]
    first, last = cells[0], cells[-1]
    # The comparison is on the ``j`` component of the step between the first
    # two cells: zero means the ship is laid out vertically.
    is_vertical = increment.j == 0
    if is_vertical:
        before, after = Coord((-1, 0)), Coord((1, 0))
    else:
        before, after = Coord((0, -1)), Coord((0, 1))
    ends = [first + before, last + after]
    return py_.intersection(ends, candidates)
from pydash import uniq_by, mapcat, intersection
import sys
import requests

# Script: given two dataset names on the command line, download
# <wsurl><name>.json for each, de-duplicate the companies by domain, and print
# "<unique in first> <unique in second> <domains common to both>".

wsurl = "https://s3.amazonaws.com/challenge.getcrossbeam.com/public/"


def _fetch_unique_companies(dataset_name):
    """Download one dataset and return its companies, unique by 'domain'."""
    response = requests.get(wsurl + dataset_name + ".json")
    # Fail with a clear HTTP error instead of a confusing JSON decode error
    # when the dataset does not exist or the request is rejected.
    response.raise_for_status()
    return uniq_by(response.json()['companies'], lambda r: r['domain'])


if len(sys.argv) < 3:
    sys.exit("usage: " + sys.argv[0] + " <dataset1> <dataset2>")

names1 = _fetch_unique_companies(sys.argv[1])
n1Ar = mapcat(names1, lambda n: n['domain'])

names2 = _fetch_unique_companies(sys.argv[2])
n2Ar = mapcat(names2, lambda n: n['domain'])

names3 = intersection(n1Ar, n2Ar)

print("{} {} {}".format(len(names1), len(names2), len(names3)))
def test_mention_context_dataset():
    """Iterate a MentionContextDataset built on hand-written lookups.

    The dataset's internal lookups are overwritten with small fixtures, the
    first five items are pulled from its iterator, and each item is compared
    key by key against the expected values.
    """
    cursor = get_mock_cursor()
    page_id_order = [3, 1, 2]
    batch_size = 5
    num_entities = 5
    num_candidates = 2
    embedding_dim = 1  # not referenced below; kept as in the original test
    entity_candidates_prior = {
        'aa': {1: 20},
        'bb': {0: 10, 1: 2},
        'cc': {2: 3},
    }
    entity_label_lookup = {label: label for label in range(5)}
    embedding_dict = {
        '<PAD>': torch.tensor([0]),
        '<UNK>': torch.tensor([2]),
        '<MENTION_START_HERE>': torch.tensor([-1]),
        '<MENTION_END_HERE>': torch.tensor([-2]),
    }
    token_idx_lookup = {
        token: idx for idx, token in enumerate(embedding_dict)
    }
    embedding_rows = [embedding_dict[token] for token in token_idx_lookup]
    embedding = nn.Embedding.from_pretrained(torch.stack(embedding_rows))
    dataset = MentionContextDataset(cursor,
                                    page_id_order,
                                    entity_candidates_prior,
                                    entity_label_lookup,
                                    embedding,
                                    token_idx_lookup,
                                    batch_size,
                                    num_entities,
                                    num_candidates)

    # (mention, offset, page_id, entity_id); the list position is the
    # mention id.
    mention_rows = [
        ('bb', 9, 2, 0),
        ('aa', 6, 2, 1),
        ('cc', 0, 1, 2),
        ('bb', 3, 1, 0),
        ('bb', 3, 0, 1),
    ]
    dataset._mention_infos = {
        mention_id: {'mention': mention,
                     'offset': offset,
                     'page_id': page_id,
                     'entity_id': entity_id,
                     'mention_id': mention_id}
        for mention_id, (mention, offset, page_id, entity_id)
        in enumerate(mention_rows)
    }
    dataset._page_content_lookup = {
        2: 'a b c aa bb',
        1: 'cc bb c b a',
        0: 'dd bb a b c',
    }
    dataset._sentence_spans_lookup = {
        page_id: [(0, 5), (6, 11)] for page_id in (2, 1, 0)
    }
    dataset._embedded_page_content_lookup = {2: [0, 1], 1: [1, 2], 0: [1]}
    dataset._entity_page_mentions_lookup = {2: [[0]], 1: [[1]], 0: [[1]]}
    dataset._mentions_per_page_ctr = {2: 2, 1: 2, 0: 1}

    def expected_item(sentence_splits, label, embedded_page_content,
                      entity_page_mentions, candidate_ids, p_prior):
        # Build one expected dataset item; tensors are created per item.
        return {'sentence_splits': sentence_splits,
                'label': label,
                'embedded_page_content': embedded_page_content,
                'entity_page_mentions': entity_page_mentions,
                'candidate_ids': torch.tensor(candidate_ids),
                'p_prior': torch.tensor(p_prior)}

    expected_data = [
        expected_item([['aa', 'bb'], ['bb']], 0, [0, 1], [[0]],
                      [0, 1], [10/12, 2/12]),
        expected_item([['aa'], ['aa', 'bb']], 1, [0, 1], [[0]],
                      [1], [1.0]),
        expected_item([['cc'], ['cc', 'bb']], 2, [1, 2], [[1]],
                      [2], [1.0]),
        expected_item([['cc', 'bb'], ['bb']], 0, [1, 2], [[1]],
                      [0, 1], [10/12, 2/12]),
        expected_item([['dd', 'bb'], ['bb']], 1, [1], [[1]],
                      [0, 1], [10/12, 2/12]),
    ]

    iterator = iter(dataset)
    # The loop variable is deliberately not named ``_``, which is the pydash
    # alias used just below.
    dataset_values = [next(iterator) for _step in range(len(expected_data))]

    def same_prior_values(a, b):
        # Every value of ``a`` must also occur in ``b``.
        return len(a) == len(_.intersection(a.tolist(), b.tolist()))

    comparison = {'sentence_splits': _.is_equal,
                  'label': _.is_equal,
                  'embedded_page_content': _.is_equal,
                  'entity_page_mentions': _.is_equal,
                  'candidate_ids': compare_candidate_ids_tensor,
                  'p_prior': same_prior_values}
    assert coll_compare_keys_by(expected_data, dataset_values, comparison)
def test_intersection(case, expected):
    """Check that ``_.intersection`` applied to the ``case`` arguments gives ``expected``."""
    actual = _.intersection(*case)
    assert actual == expected
'BGBLPGPL': [] }

# Read the four dbVar gene-list files; each becomes a list of the file's
# lines (presumably one gene symbol per line -- TODO confirm).
with open('files/gene lists/dbVarCNGsBenignPCGenes.txt', 'r') as dbVarCNGsBenignPCGenes:
    benignGain = dbVarCNGsBenignPCGenes.read().splitlines()
with open('files/gene lists/dbVarCNLsBenignPCGenes.txt', 'r') as dbVarCNLsBenignPCGenes:
    benignLoss = dbVarCNLsBenignPCGenes.read().splitlines()
with open('files/gene lists/dbVarCNGPeaksPathogenicPCGenes.txt', 'r') as dbVarCNGPeaksPathogenicPCGenes:
    pathoGain = dbVarCNGPeaksPathogenicPCGenes.read().splitlines()
with open('files/gene lists/dbVarCNLPeaksPathogenicPCGenes.txt', 'r') as dbVarCNLPeaksPathogenicPCGenes:
    pathoLoss = dbVarCNLPeaksPathogenicPCGenes.read().splitlines()

# Key naming used below: BG = benignGain, BL = benignLoss, PG = pathoGain,
# PL = pathoLoss. Each key lists the gene lists an entry must appear in; the
# lists not named in the key are subtracted, so every entry lands in exactly
# one of the buckets computed here.
# NOTE(review): `lists` is presumably the dict whose literal closes on the
# first line above; its opening is outside this view.

# Present in all four lists.
lists['BGBLPGPL'] = pydash.intersection(benignGain, benignLoss, pathoGain, pathoLoss)

# Present in exactly three of the lists.
_BGBLPG = pydash.intersection(benignGain, benignLoss, pathoGain)
lists['BGBLPG'] = pydash.difference(_BGBLPG, pathoLoss)
_BGBLPL = pydash.intersection(benignGain, benignLoss, pathoLoss)
lists['BGBLPL'] = pydash.difference(_BGBLPL, pathoGain)
_BGPGPL = pydash.intersection(benignGain, pathoGain, pathoLoss)
lists['BGPGPL'] = pydash.difference(_BGPGPL, benignLoss)
_BLPGPL = pydash.intersection(benignLoss, pathoGain, pathoLoss)
lists['BLPGPL'] = pydash.difference(_BLPGPL, benignGain)

# Present in both benign lists and in neither pathogenic list.
_BGBL = pydash.intersection(benignGain, benignLoss)
lists['BGBL'] = pydash.difference(_BGBL, pathoGain, pathoLoss)
def main():
    """Train a restaurant-rating classifier and print one explanation.

    Steps:
      1. Read the restaurant, user, cuisine and rating CSV files from
         ``./data``.
      2. For every rating whose user and place both have profile rows, build
         one joined row of user features, restaurant features and rating.
      3. Encode the features with ``encode_data``, fit a one-hot encoder and
         a class-balanced random forest that predicts ``rating == 2``.
      4. Pass the held-out split to ``report``, build an explainer with
         ``train`` and ``inquire``, and pretty-print
         ``explanation.as_list()`` for the first test instance.
    """
    printer = pprint.PrettyPrinter(indent=2)
    restaurant_columns = [
        'alcohol', 'smoking_area', 'dress_code', 'accessibility', 'price',
        'Rambience', 'franchise'
    ]
    restaurant_cat_columns = [
        'alcohol', 'smoking_area', 'dress_code', 'accessibility', 'price',
        'Rambience', 'franchise'
    ]
    user_columns = [
        'smoker', 'drink_level', 'dress_preference', 'ambience', 'transport',
        'marital_status', 'hijos', 'birth_year', 'interest', 'personality',
        'religion', 'activity', 'weight', 'budget', 'height'
    ]
    user_cat_columns = [
        'smoker', 'drink_level', 'dress_preference', 'ambience', 'transport',
        'marital_status', 'hijos', 'interest', 'personality', 'religion',
        'activity', 'budget'
    ]
    features = user_columns + restaurant_columns
    categorical_features = user_cat_columns + restaurant_cat_columns
    categorical_feature_indices = [
        features.index(feature_name) for feature_name in categorical_features
    ]

    # Categorical columns are read as strings.
    restaurant_info = pd.read_csv(
        './data/geoplaces2.csv',
        dtype={column: str for column in restaurant_cat_columns})
    user_info = pd.read_csv(
        './data/userprofile.csv',
        dtype={column: str for column in user_cat_columns})
    # These two frames are loaded but not used further in this function.
    cuisine_type = pd.read_csv('./data/chefmozcuisine.csv')
    user_rating = pd.read_csv('./data/rating_final.csv')
    user_preferences = pd.read_csv('./data/usercuisine.csv')

    # Only ratings whose user and place both have profile data can be joined.
    # Sets give O(1) membership tests inside the per-rating loop below.
    place_ids = set(_.intersection(user_rating['placeID'].tolist(),
                                   restaurant_info['placeID'].tolist()))
    user_ids = set(_.intersection(user_rating['userID'].tolist(),
                                  user_info['userID'].tolist()))

    joined_data_dict = {column: [] for column in features + ['rating']}
    rating_rows = user_rating[['userID', 'placeID', 'rating']]
    for _index, row in rating_rows.iterrows():
        user_id = row['userID']
        place_id = row['placeID']
        if user_id not in user_ids or place_id not in place_ids:
            continue
        # Filter each profile table once per rating rather than once per
        # column.
        user_rows = user_info[user_info['userID'] == user_id]
        place_rows = restaurant_info[restaurant_info['placeID'] == place_id]
        joined_data_dict['rating'].append(row['rating'])
        for column in user_columns:
            joined_data_dict[column].append(user_rows[column].iloc[0])
        for column in restaurant_columns:
            joined_data_dict[column].append(place_rows[column].iloc[0])

    joined_data = pd.DataFrame(joined_data_dict)
    data = joined_data.loc[:, joined_data.columns != 'rating']
    rating = joined_data['rating']
    encoded_data, categorical_names = encode_data(data, features,
                                                  categorical_features)
    X = encoded_data
    # Binary target: True where the rating equals 2.
    y = rating.values == 2

    # NOTE(review): the `categorical_features` argument was removed from
    # OneHotEncoder in newer scikit-learn releases; this call needs an older
    # version or a port to ColumnTransformer -- confirm the pinned version.
    one_hot_encoder = OneHotEncoder(
        categorical_features=categorical_feature_indices)
    one_hot_encoder.fit(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    model = ensemble.RandomForestClassifier(class_weight='balanced')
    model.fit(one_hot_encoder.transform(X_train), y_train)

    def predict_fn(X):
        return model.predict(one_hot_encoder.transform(X))

    def predict_proba_fn(X):
        return model.predict_proba(one_hot_encoder.transform(X))

    report(predict_fn, X_test, y_test)
    explainer = train({
        'X_train': X_train.values,
        'y_train': y_train
    }, {
        'feature_names': features,
        'categorical_feature_indices': categorical_feature_indices,
        'categorical_names': categorical_names
    })
    explanation = inquire(
        explainer, {
            'instance': X_test.iloc[0],
            'predict_fn': predict_proba_fn,
            'num_features': 5
        })
    printer.pprint(explanation.as_list())
def len_intersection(f1, f2):
    """Return the number of elements in the intersection of ``f1`` and ``f2``."""
    common = intersection(f1, f2)
    return len(common)