def build_flat_dataset(df, G, emails, percent_neg=1, dr=False, n_cores=4):
    """Build a flat (sender, recipient) dataset with one row per pair,
    labelled 1 for real recipients and 0 for randomly sampled fake ones."""
    df = df.copy()[["sender", "recipients", "mid"]]
    if not dr:
        G = G.to_undirected()
    # emails = set(self.id2email)
    emails = set(emails)
    # create fake pairs randomly
    if n_cores > 1:
        df_neg = parallelize_dataframe(df, sample_neg_rec_df, num_cores=n_cores,
                                       emails=emails, G=G, percent_neg=percent_neg)
    else:
        df_neg = sample_neg_rec_df(df, emails=emails, G=G, percent_neg=percent_neg)
    # flatten the recipients
    df_flat_rec = flatmap(df, "recipients", "recipient")
    df_flat_neg = flatmap(df_neg.drop("recipients", axis=1), "negs", "recipient")
    # add labels: 0 for fake recipient, 1 for others
    df_flat_rec["label"] = 1
    df_flat_neg["label"] = 0
    # concat neg and real recipient pairs
    df_flat = pd.concat((df_flat_rec, df_flat_neg), axis=0)
    return df_flat
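# Note: the snippets in this section assume a small `flatmap` helper that is not shown.
# For the dataframe-based calls such as `flatmap(df, "recipients", "recipient")`, below
# is a minimal sketch of what it is assumed to do: explode a list column into one row
# per element and rename it. The optional `new_col_type` argument is an assumption based
# on the call `utils.flatmap(dataset, "recipients", "recipient", np.string_)` further
# down; the real helper may differ.
import pandas as pd

def flatmap(df, col, new_col, new_col_type=None):
    # one output row per element of the list stored in `col`, keeping the other columns
    out = df.explode(col).rename(columns={col: new_col})
    if new_col_type is not None:
        out[new_col] = out[new_col].astype(new_col_type)
    return out.reset_index(drop=True)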
def scrap_rows_soups(context_message: str,
                     rows_soups: Iterable[List[BeautifulSoup]]) -> List[ScrappedChapterRelease]:
    """Formats each row's content into a ScrappedChapterRelease object."""
    scrapped_chapters = []
    for row_number, row_cells in enumerate(rows_soups):
        if len(row_cells) != EXPECTED_ELEM_BY_LINE:
            message = f"row {row_number} for {context_message} does not have " \
                      f"{EXPECTED_ELEM_BY_LINE} cells. Skipping it." \
                      f"\nRow was:\n {repr(row_cells)}"
            if len(row_cells) > 1:
                warnings.warn(message, ScrappingWarning)
            else:
                logging.info(message)
            continue
        volume_str = row_cells[VOL_COLUMN_INDEX].get_text()
        volume = None
        if volume_str.strip():
            try:
                volume = int(volume_str)
            except ValueError as e:
                warnings.warn(f"Failed to convert non-empty volume str to int for {context_message}. "
                              f"Error was {e}",
                              ScrappingWarning)
        chapter_string = row_cells[CHAPTER_COLUMN_INDEX].get_text()
        group = row_cells[GROUPS_COLUMN_INDEX].get_text()
        chapters_elements = [chapter_string]
        for splitting_chars in SPLITTING_CHAPTER_CHARS:
            chapters_elements = flatmap(lambda elem: elem.split(splitting_chars), chapters_elements)
        # no interpolation as the inference rule is too complex to code as of now,
        # given the diversity of possibilities.
        scrapped_chapters.extend(ScrappedChapterRelease(group, chapter, volume)
                                 for chapter in chapters_elements)
    return scrapped_chapters
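# Other snippets here call `flatmap(func, iterable)` on plain Python iterables instead,
# e.g. `flatmap(lambda elem: elem.split(splitting_chars), chapters_elements)` above or
# `ut.flatmap(lambda x: K[x], tmp)` below. A minimal sketch of that functional variant,
# assuming it maps `func` over the iterable and concatenates the results into a list:
from itertools import chain

def flatmap(func, iterable):
    # apply func to every element and flatten the resulting sequences into one list
    return list(chain.from_iterable(map(func, iterable)))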
def extend_simplices(S, K, q):
    # Extend the simplices in S to candidate q-simplices using the complex K.
    C = set()
    # base case: enumerate all q-subsets of the simplices in K
    if q == 2 or len(S) == 0:
        for s in K:
            if len(s) == q:
                C.add(tuple(sorted(s)))
            elif len(s) > q:
                for c in combinations(s, q):
                    C.add(tuple(sorted(c)))
        return list(C)
    # inverted index: vertex -> indices of the simplices of K that contain it
    inv_index = defaultdict(set)
    for idx in range(len(K)):
        s = K[idx]
        if len(s) >= q:
            for v in s:
                inv_index[v].add(idx)
    if len(inv_index) == 0:
        return []
    for s in S:
        # simplices of K containing every vertex of s
        tmp = inv_index[s[0]]
        for idx in range(1, len(s)):
            tmp = tmp & inv_index[s[idx]]
        if len(tmp) > 0:
            # vertices of those simplices that are not already in s
            tmp = set(ut.flatmap(lambda x: K[x], tmp))
            for v in s:
                tmp.remove(v)
            # each remaining vertex extends s by one dimension
            for e in tmp:
                ext = list(s)
                ext.append(e)
                C.add(tuple(sorted(ext)))
    return list(C)
def build_people_vectors(self, d2v=False):
    """Compute one average message vector per sender and per recipient."""
    # sender (avg vector of all messages sent)
    self.sender_rep = {}
    se = self.dataset.groupby("sender")["mid"].apply(list).to_dict()
    for k, v in se.items():
        if d2v:
            vecs = [self.d2v[mid] for mid in v]
        else:
            vecs = [self.wcbow[mid] for mid in v]
        self.sender_rep[k] = np.average(vecs, axis=0)
    # recipient (avg vector of all messages received)
    self.recipient_rep = {}
    d = flatmap(self.dataset, "recipients", "recipient") \
        .groupby("recipient")["mid"] \
        .apply(list) \
        .to_dict()
    for k, v in d.items():
        if d2v:
            vecs = [self.d2v[mid] for mid in v]
        else:
            vecs = [self.wcbow[mid] for mid in v]
        self.recipient_rep[k] = np.average(vecs, axis=0)
    return
def create_test_df(df, G, dr=False):
    """Build the test dataframe: one row per (sender, candidate) pair,
    where candidates are the sender's neighbours in the graph G."""
    df = df.copy()
    if not dr:
        G = G.to_undirected()
    # df.sender = df.sender.map(lambda x: self.mail2id[x])
    # df.recipients = df.recipients.map(
    #     lambda x: [self.mail2id[m] for m in x])
    # list() is needed with networkx >= 2.0, where neighbors() returns an iterator
    df["candidates"] = df.sender.map(lambda x: list(G.neighbors(x)))
    df_flat = flatmap(df, "candidates", "candidate")
    return df_flat
def make_flat_dataset(df, all_emails, fake_percent, num_cores=4, log=False):
    """df is supposed to be the df obtained with load_dataset :)"""
    # create fake pairs randomly
    df_neg = parallelize_dataframe(df, sample_neg_rec_df, num_cores=num_cores,
                                   all_emails=all_emails, percent=fake_percent)
    # flatten the recipients
    df_flat_rec = utils.flatmap(df, "recipients", "recipient")
    df_flat_neg = utils.flatmap(df_neg.drop("recipients", axis=1), "negs", "recipient")
    # add labels: 0 for fake recipient, 1 for others
    df_flat_rec["label"] = 1
    df_flat_neg["label"] = 0
    # concat neg and real recipient pairs
    df_flat = pd.concat((df_flat_rec, df_flat_neg), axis=0)
    return df_flat
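# `parallelize_dataframe(df, func, num_cores=..., **kwargs)` is used above but not
# defined in this section. A plausible sketch, assuming the usual split / apply-in-a-
# process-pool / concat pattern; the exact signature and keyword handling are assumptions:
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd

def parallelize_dataframe(df, func, num_cores=4, **kwargs):
    # split the dataframe into num_cores chunks, apply func to each chunk in its own
    # process, then concatenate the partial results back into a single dataframe
    chunks = np.array_split(df, num_cores)
    with Pool(num_cores) as pool:
        results = pool.map(partial(func, **kwargs), chunks)
    return pd.concat(results, axis=0)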
def __init__(self, dataset, df_test=None):
    """dataset: dataframe built with "utils.load_dataset" """
    self.dataset = dataset
    self.df_test = df_test
    # id2mail and mail2id mapping
    self.s_email = dataset.sender.unique()
    self.r_email = flatmap(dataset, "recipients", "recipient") \
        .recipient.unique()
    self.id2email = sorted(list(set(self.r_email).union(self.s_email)))
    self.mail2id = dict((m, k) for k, m in enumerate(self.id2email))
    self.num_doc = dataset.shape[0]
    self.doc = {}
    self.wcbow = {}
def build_recipient_sender_vectors(self, d2v=False):
    """Compute one average message vector per (recipient, sender) pair."""
    # recipient | sender rep
    df = self.dataset
    # flatten dataset
    df_flat = flatmap(df[["sender", "mid", "recipients"]],
                      "recipients", "recipient")
    # groupby recipient then sender, collect list of mid
    rep = df_flat.groupby(["recipient", "sender"])["mid"].apply(list) \
        .to_dict()
    # average the vectors of all messages exchanged by each pair
    for tupl in rep.keys():
        if d2v:
            rep[tupl] = np.average([self.d2v[m] for m in rep[tupl]], axis=0)
        else:
            rep[tupl] = np.average([self.wcbow[m] for m in rep[tupl]], axis=0)
    self.recipient_sender_rep = rep
    return
def outgoing_text_similarity(dataset, mid, user, twidf_df, n):
    """Computing outgoing textual similarity.
    TOO SLOW: was not used!"""
    # Dataset containing all previous emails sent by person 'user'
    dataset_from_rec = dataset[dataset.sender == user]
    # Measure similarity between the message of id 'mid' and all the messages sent
    dataset_similar = top_n_similarity(n, mid, dataset_from_rec, twidf_df)
    df_outgoing = pd.DataFrame(
        columns=['mid', 'user', 'contact', 'outgoing_text'])
    dataset_flat = utils.flatmap(dataset, "recipients", "recipient", np.string_)
    list_recipients = np.unique(dataset_flat['recipient'].tolist())
    list_recipients_similar = np.unique(
        sum(dataset_similar['recipients'].tolist(), []))
    df_outgoing['contact'] = pd.Series(list_recipients)
    df_outgoing['mid'] = mid
    df_outgoing['user'] = user
    df_outgoing['outgoing_text'] = pd.Series(
        [1 if c in list_recipients_similar else -1 for c in list_recipients])
    return df_outgoing
def validate_neighbours(C, nbs):
    # inverted index: vertex -> indices of the simplices of C that contain it
    inv_idx = defaultdict(set)
    for idx in range(len(C)):
        for v in C[idx]:
            inv_idx[v].add(idx)
    real_nbs = defaultdict(set)
    for s, nb_set in nbs.items():
        # vertices appearing in the neighbouring simplices, minus those of C[s]
        v_set = set(ut.flatmap(lambda x: C[x], nb_set))
        for v in C[s]:
            v_set.discard(v)
        for v in v_set:
            if v > C[s][-1]:
                cand_joist = inv_idx[v] & nb_set
                if len(cand_joist) == len(C[s]):
                    real_nbs[C[s]].add(v)
                    for c in cand_joist:
                        # first vertex of C[s] that is not already in C[c]
                        w = C[s][0]
                        nxt = 1
                        while w in C[c]:
                            w = C[s][nxt]
                            nxt += 1
                        real_nbs[C[c]].add(w)
    return real_nbs
def useful_divisors(terms):
    threshold = None if args.exhaustive else KEY_LENGTH_THRESHOLD
    return flatmap(lambda n: list(utils.divisors(n, threshold))[1:], terms)
logger.info("Timeseries files path: " + ts_file) cf_file = experiment["files_path"]["preproc"]["noise_components"] #set up data dirs subjects_pref = map(lambda subject: '_subject_id_' + (subject), subject_list) sessions_subjects_dir = map( lambda session: map( lambda subject_pref: '_session_id_' + str(session) + subject_pref, subjects_pref), session_list) #flattened all filenames input_dirs = map( lambda session: map(lambda subj: op.join(data_dir, subj), session), sessions_subjects_dir) func_filenames = list( flatmap( lambda session: map(lambda subj: op.join(data_dir, subj, ts_file), session), sessions_subjects_dir)) confounds_components = list( flatmap( lambda session: map(lambda subj: op.join(data_dir, subj, cf_file), session), sessions_subjects_dir)) # update configuration file used number of components experiment["#components"] = args.n_components local_config["experiment"] = experiment update_experiment(local_config, args.config) split = experiment["split"] # Extract resting-state networks with CanICA # ---------------------------------
def get_all_policies(org):
    return get_org_policies(org) \
        + flatmap(lambda f: get_folder_policies(f), get_folder_ids(org)) \
        + flatmap(lambda p: get_project_policies(p), get_project_ids())