Example #1
def build_flat_dataset(df, G, emails, percent_neg=1, dr=False, n_cores=4):
    """ """
    df = df.copy()[["sender", "recipients", "mid"]]
    if not dr:
        G = G.to_undirected()
    # emails = set(self.id2email)
    emails = set(emails)
    # create fake pairs randomly
    if n_cores > 1:
        df_neg = parallelize_dataframe(df,
                                       sample_neg_rec_df,
                                       num_cores=n_cores,
                                       emails=emails,
                                       G=G,
                                       percent_neg=percent_neg)
    else:
        df_neg = sample_neg_rec_df(df,
                                   emails=emails,
                                   G=G,
                                   percent_neg=percent_neg)
    # flatten the recipients
    df_flat_rec = flatmap(df, "recipients", "recipient")
    df_flat_neg = flatmap(df_neg.drop("recipients", axis=1), "negs",
                          "recipient")
    # add labels: 0 for fake recipient, 1 for others
    df_flat_rec["label"] = 1
    df_flat_neg["label"] = 0
    # concat negative and real recipient pairs
    df_flat = pd.concat((df_flat_rec, df_flat_neg), axis=0)
    #
    return df_flat
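The flatmap(df, col, new_col) helper used above is not part of the listing. A minimal sketch of what it could look like, assuming it explodes a list-valued column into one row per element (the optional dtype argument mirrors the fourth positional argument passed in Example #9; everything beyond what the call sites show is an assumption):

import pandas as pd


def flatmap(df, col, new_col, dtype=None):
    # Explode the list-valued column `col` into one row per element,
    # exposing each element under the name `new_col`.
    # Requires pandas >= 0.25 for DataFrame.explode.
    out = df.explode(col).rename(columns={col: new_col})
    if dtype is not None:
        out[new_col] = out[new_col].astype(dtype)
    return out.reset_index(drop=True)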
Example #2
def scrap_rows_soups(context_message: str, rows_soups: Iterable[List[BeautifulSoup]]) -> List[ScrappedChapterRelease]:
    """ formats a row content to represent it as a ScrappedChapterRelease object"""
    scrapped_chapters = []
    for row_number, row_cells in enumerate(rows_soups):
        if len(row_cells) != EXPECTED_ELEM_BY_LINE:
            message = f"row {row_number} for {context_message} does not have 5 cells. Skipping it." \
                      f"\nRow was:\n {repr(row_cells)}"
            if len(row_cells) > 1:
                warnings.warn(message, ScrappingWarning)
            else:
                logging.info(message)
            continue

        volume_str = row_cells[VOL_COLUMN_INDEX].get_text()
        volume = None
        if volume_str.strip():
            try:
                volume = int(volume_str)
            except ValueError as e:
                warnings.warn(f"Failed to convert non empty volume str to int for {context_message}. Error was {e}",
                              ScrappingWarning)

        chapter_string = row_cells[CHAPTER_COLUMN_INDEX].get_text()

        group = row_cells[GROUPS_COLUMN_INDEX].get_text()
        chapters_elements = [chapter_string]
        for splitting_chars in SPLITTING_CHAPTER_CHARS:
            chapters_elements = flatmap(lambda elem: elem.split(splitting_chars), chapters_elements)
        # no interpolation: the inference rule is too complex to code for now, given the diversity of possibilities.

        scrapped_chapters.extend(ScrappedChapterRelease(group, chapter, volume) for chapter in chapters_elements)
    return scrapped_chapters
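The flatmap used in this example (and in Examples #3 and #10 to #13) is a different flavour: it takes a function and an iterable rather than a DataFrame. A minimal sketch under that assumption, returning a flattened list:

from itertools import chain


def flatmap(func, iterable):
    # Map `func` over the iterable and flatten the resulting
    # iterables into a single list.
    return list(chain.from_iterable(map(func, iterable)))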
Example #3
def extend_simplices(S, K, q):
    C = set()
    if q == 2 or len(S) == 0:
        for s in K:
            if len(s) == q:
                C.add(tuple(sorted(s)))
            elif len(s) > q:
                for c in combinations(s, q):
                    C.add(tuple(sorted(c)))
        return list(C)
    inv_index = defaultdict(set)
    for idx in range(len(K)):
        s = K[idx]
        if len(s) >= q:
            for v in s:
                inv_index[v].add(idx)
    if len(inv_index) == 0:
        return []
    for s in S:
        tmp = inv_index[s[0]]
        for idx in range(1, len(s)):
            tmp = tmp & inv_index[s[idx]]
        if len(tmp) > 0:
            tmp = set(ut.flatmap(lambda x: K[x], tmp))
            for v in s:
                tmp.remove(v)
            for e in tmp:
                ext = list(s)
                ext.append(e)
                C.add(tuple(sorted(ext)))
    return list(C)
Example #4
    def build_people_vectors(self, d2v=False):
        """ """
        # sender (avg vector of all messages sent)
        self.sender_rep = {}
        se = self.dataset.groupby("sender")["mid"].apply(list).to_dict()
        for k, v in se.items():
            if d2v:
                vecs = [self.d2v[mid] for mid in v]
            else:
                vecs = [self.wcbow[mid] for mid in v]
            self.sender_rep[k] = np.average(vecs, axis=0)

        # recipient (avg vector of all messages received)
        self.recipient_rep = {}
        d = flatmap(self.dataset, "recipients", "recipient") \
            .groupby("recipient")["mid"] \
            .apply(list) \
            .to_dict()
        for k, v in d.items():
            if d2v:
                vecs = [self.d2v[mid] for mid in v]
            else:
                vecs = [self.wcbow[mid] for mid in v]
            self.recipient_rep[k] = np.average(vecs, axis=0)
        return
Example #5
def create_test_df(df, G, dr=False):
    """ """
    df = df.copy()
    if not dr:
        G = G.to_undirected()
    # df.sender = df.sender.map(lambda x: self.mail2id[x])
    # df.recipients = df.recipients.map(
    #     lambda x: [self.mail2id[m] for m in x])
    df["candidates"] = df.sender.map(lambda x: G.neighbors(x))
    df_flat = flatmap(df, "candidates", "candidate")
    return df_flat
Example #6
def make_flat_dataset(df, all_emails, fake_percent, num_cores=4, log=False):
    """
    df is supposed to be the df obtained with load_dataset :)
    """
    # create fake pairs randomly
    df_neg = parallelize_dataframe(df,
                                   sample_neg_rec_df,
                                   num_cores=num_cores,
                                   all_emails=all_emails,
                                   percent=fake_percent)
    # flatten the recipients
    df_flat_rec = utils.flatmap(df, "recipients", "recipient")
    df_flat_neg = utils.flatmap(df_neg.drop("recipients", axis=1), "negs",
                                "recipient")
    # add labels: 0 for fake recipient, 1 for others
    df_flat_rec["label"] = 1
    df_flat_neg["label"] = 0
    # concat negative and real recipient pairs
    df_flat = pd.concat((df_flat_rec, df_flat_neg), axis=0)
    #
    return df_flat
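Examples #1 and #6 both delegate negative sampling to parallelize_dataframe, which is not shown in the listing. A minimal sketch of the usual split/apply/concat pattern, assuming the helper forwards keyword arguments to the worker function (sample_neg_rec_df here):

from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd


def parallelize_dataframe(df, func, num_cores=4, **kwargs):
    # Split the frame into roughly equal chunks, run `func` on each
    # chunk in its own worker process, then concatenate the results.
    bounds = np.linspace(0, len(df), num_cores + 1, dtype=int)
    chunks = [df.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]
    with Pool(num_cores) as pool:
        results = pool.map(partial(func, **kwargs), chunks)
    return pd.concat(results, axis=0)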
Example #7
 def __init__(self, dataset, df_test=None):
     """
     dataset: dataframe built with "utils.load_dataset"
     """
     self.dataset = dataset
     self.df_test = df_test
     # id2mail and mail2id mapping
     self.s_email = dataset.sender.unique()
     self.r_email = flatmap(dataset, "recipients", "recipient") \
         .recipient.unique()
     self.id2email = sorted(list(set(self.r_email).union(self.s_email)))
     self.mail2id = dict((m, k) for k, m in enumerate(self.id2email))
     self.num_doc = dataset.shape[0]
     self.doc = {}
     self.wcbow = {}
Example #8
 def build_recipient_sender_vectors(self, d2v=False):
     """ """
     # recipient | sender rep
     df = self.dataset
     # flatten dataset
     df_flat = flatmap(df[["sender", "mid", "recipients"]], "recipients",
                       "recipient")
     # groupby recipient then sender, collect list on mid
     rep = df_flat.groupby(["recipient", "sender"])["mid"].apply(list) \
         .to_dict()
     #
     for tupl in rep.keys():
         if d2v:
             rep[tupl] = np.average([self.d2v[m] for m in rep[tupl]],
                                    axis=0)
         else:
             rep[tupl] = np.average([self.wcbow[m] for m in rep[tupl]],
                                    axis=0)
     self.recipient_sender_rep = rep
     return
Example #9
def outgoing_text_similarity(dataset, mid, user, twidf_df, n):
    """
    Computing outgoing textual similarity
    TOO SLOW: was not used!
    """
    # Dataset containing all previous emails sent by person 'user'
    dataset_from_rec = dataset[dataset.sender == user]
    # Measure similarity between the message of id 'mid' and all the messages sent
    dataset_similar = top_n_similarity(n, mid, dataset_from_rec, twidf_df)
    df_outgoing = pd.DataFrame(
        columns=['mid', 'user', 'contact', 'outgoing_text'])
    dataset_flat = utils.flatmap(dataset, "recipients", "recipient",
                                 np.string_)
    list_recipients = np.unique(dataset_flat['recipient'].tolist())
    list_recipients_similar = np.unique(
        sum(dataset_similar['recipients'].tolist(), []))
    df_outgoing['contact'] = pd.Series(list_recipients)
    df_outgoing['mid'] = mid
    df_outgoing['user'] = user
    df_outgoing['outgoing_text'] = pd.Series(
        [1 if c in list_recipients_similar else -1 for c in list_recipients])
    return df_outgoing
Example #10
def validate_neighbours(C, nbs):
    inv_idx = defaultdict(set)
    for idx in range(len(C)):
        for v in C[idx]:
            inv_idx[v].add(idx)
    real_nbs = defaultdict(set)
    for s, nb_set in nbs.items():
        v_set = set(ut.flatmap(lambda x: C[x], nb_set))
        for v in C[s]:
            v_set.discard(v)
        for v in v_set:
            if v > C[s][-1]:
                cand_joist = inv_idx[v] & nb_set
                if len(cand_joist) == len(C[s]):
                    real_nbs[C[s]].add(v)
                    for c in cand_joist:
                        w = C[s][0]
                        nxt = 1
                        while w in C[c]:
                            w = C[s][nxt]
                            nxt += 1
                        real_nbs[C[c]].add(w)
    return real_nbs
Example #11
def useful_divisors(terms):
    threshold = None if args.exhaustive else KEY_LENGTH_THRESHOLD
    return flatmap(lambda n: list(utils.divisors(n, threshold))[1:], terms)
Example #12
logger.info("Timeseries files path: " + ts_file)
cf_file = experiment["files_path"]["preproc"]["noise_components"]

# set up data dirs
subjects_pref = map(lambda subject: '_subject_id_' + (subject), subject_list)
sessions_subjects_dir = map(
    lambda session: map(
        lambda subject_pref: '_session_id_' + str(session) + subject_pref,
        subjects_pref), session_list)
# flatten all filenames
input_dirs = map(
    lambda session: map(lambda subj: op.join(data_dir, subj), session),
    sessions_subjects_dir)
func_filenames = list(
    flatmap(
        lambda session: map(lambda subj: op.join(data_dir, subj, ts_file),
                            session), sessions_subjects_dir))
confounds_components = list(
    flatmap(
        lambda session: map(lambda subj: op.join(data_dir, subj, cf_file),
                            session), sessions_subjects_dir))

# update the configuration file with the number of components used
experiment["#components"] = args.n_components
local_config["experiment"] = experiment
update_experiment(local_config, args.config)
split = experiment["split"]

# Extract resting-state networks with CanICA

# ---------------------------------
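One caveat if this snippet runs under Python 3: map returns a lazy iterator there, so sessions_subjects_dir is exhausted by the first list(flatmap(...)) call and confounds_components comes out empty. A sketch of the same setup with the intermediate results materialized as lists (variable names unchanged; the two flatmap calls can then stay as they are):

subjects_pref = ['_subject_id_' + subject for subject in subject_list]
sessions_subjects_dir = [
    ['_session_id_' + str(session) + subject_pref
     for subject_pref in subjects_pref]
    for session in session_list]
input_dirs = [[op.join(data_dir, subj) for subj in session]
              for session in sessions_subjects_dir]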
Example #13
def get_all_policies(org):
    return get_org_policies(org)\
        + flatmap(lambda f: get_folder_policies(f), get_folder_ids(org))\
        + flatmap(lambda p: get_project_policies(p), get_project_ids())