def main():

    # define arg parser
    parser = argparse.ArgumentParser()
    parser.add_argument('--source_path', required=True)
    parser.add_argument('--dest_path', default='da_movs.feather')
    args = vars(parser.parse_args())

    # define path
    PROJ = Path()
    DATA = PROJ / 'dados'
    RAW = DATA / 'brutos'
    PROCESSED = DATA / 'processados'

    # load main data
    dados = pd.read_feather(args['source_path'])
    dados = dados.drop_duplicates('file_json')
    paths = list(set(dados['file_json'].str[3:].to_list()))
    paths = random.sample(paths, 1)

    # execute loops to read, process and join files
    kwargs = {'n_jobs': -2, 'verbose': 10}
    movs = Parallel(**kwargs)(delayed(parse_lawsuits)(p) for p in paths)
    movs = pd.concat(movs, ignore_index=True)
    movs = movs.astype(str)
    movs.to_feather(Path(args['dest_path']))
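
# A minimal, self-contained sketch of the Parallel/delayed pattern used above.
# `fake_parse_lawsuits` is a hypothetical stand-in for the project's
# `parse_lawsuits`, which is defined elsewhere and not shown here.
import pandas as pd
from joblib import Parallel, delayed

def fake_parse_lawsuits(path):
    # A real implementation would read and parse the JSON file at `path`.
    return pd.DataFrame({'file_json': [path], 'n_movs': [0]})

example_paths = ['a.json', 'b.json', 'c.json']
frames = Parallel(n_jobs=-2, verbose=10)(
    delayed(fake_parse_lawsuits)(p) for p in example_paths)
print(pd.concat(frames, ignore_index=True))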
Example No. 2
def spectral(x, sf, f, stype, dcomplex, cycle, width, n_jobs):
    """Extract spectral informations from data.

    Parameters
    ----------
    x : array_like
        Array of data
    sf : float
        Sampling frequency
    f : array_like
        Frequency vector of shape (N, 2)
    stype : string
        Spectral information to extract (use either 'pha' or 'amp')
    dcomplex : string
        Complex decomposition type. Use either 'hilbert' or 'wavelet'
    cycle : int
        Number of cycles to use for fir1 filtering.
    width : int
        Width of the wavelet.
    n_jobs : int
        Number of parallel jobs to use. If -1, all available cores are used.
    """
    n_freqs = f.shape[0]
    # Filtering + complex decomposition :
    if dcomplex == 'hilbert':
        # get filtering coefficients
        b = []
        a = np.zeros((n_freqs,), dtype=float)
        forder = np.zeros((n_freqs,), dtype=int)
        for k in range(n_freqs):
            forder[k] = fir_order(sf, x.shape[-1], f[k, 0], cycle=cycle)
            _b, a[k] = fir1(forder[k], f[k, :] / (sf / 2.))
            b += [_b]
        # Filt each time series :
        xf = Parallel(n_jobs=n_jobs, **CONFIG['JOBLIB_CFG'])(delayed(filtfilt)(
            b[k], a[k], x, padlen=forder[k], axis=-1) for k in range(n_freqs))
        # Use hilbert for the complex decomposition :
        xd = np.asarray(xf)
        if stype is not None:
            xd = hilbertm(xd)
    elif dcomplex == 'wavelet':
        f = f.mean(1)  # centered frequencies
        xd = Parallel(n_jobs=n_jobs, **CONFIG['JOBLIB_CFG'])(delayed(morlet)(
            x, sf, k, width) for k in f)

    # Extract phase / amplitude :
    if stype == 'pha':
        return np.angle(xd).astype(np.float64)
    elif stype == 'amp':
        return np.abs(xd).astype(np.float64)
    elif stype is None:
        return xd.astype(np.float64)
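
# Self-contained sketch of the 'hilbert' branch above, assuming SciPy's firwin in
# place of the project's fir_order/fir1 helpers (not shown here): each frequency
# band is band-pass filtered in parallel, then Hilbert-transformed.
import numpy as np
from joblib import Parallel, delayed
from scipy.signal import firwin, filtfilt, hilbert

sf = 256.                                   # sampling frequency (Hz)
x = np.random.randn(2, int(sf) * 4)         # (n_channels, n_times)
f = np.array([[4., 8.], [8., 13.]])         # frequency bands, shape (N, 2)

def band_filter(band, order=int(sf)):
    b = firwin(order + 1, band, fs=sf, pass_zero=False)
    return filtfilt(b, 1., x, axis=-1)

xf = Parallel(n_jobs=-1)(delayed(band_filter)(band) for band in f)
xd = hilbert(np.asarray(xf), axis=-1)       # complex decomposition
pha, amp = np.angle(xd), np.abs(xd)
print(pha.shape, amp.shape)                 # (n_freqs, n_channels, n_times)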
def extract_all_class_features(dataset, n_jobs=1, stride=5, patch_size=10):
    """Extract masked features from all dataset images, return features and labels"""
    cns = []
    labels = []
    for (label, cls) in enumerate(dataset.classes):
        print('Extracting masked CNs from class {}'.format(cls))
        hists = Parallel(n_jobs=n_jobs)(
            delayed(extract_masked_cns)(imname, maskname)
            for (imname, maskname) in dataset.get_class_images(cls))
        hists = np.vstack(hists)
        labels.append(label * np.ones((len(hists),), dtype=np.float32))
        cns.append(hists.astype(np.float32))
    
    # Return per-class lists of feature arrays and label arrays.
    return (cns, labels)
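
# The function above returns per-class lists; a small hypothetical helper for
# stacking them into flat training arrays could look like this.
import numpy as np

def stack_class_features(cns, labels):
    X = np.vstack(cns)          # (n_samples, n_features)
    y = np.concatenate(labels)  # (n_samples,)
    return X, y

X, y = stack_class_features([np.ones((2, 3)), np.zeros((1, 3))],
                            [np.zeros(2), np.ones(1)])
print(X.shape, y.shape)  # (3, 3) (3,)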
def main():

    # define paths
    PROJ = Path()
    DATA = PROJ / 'dados'
    RAW = DATA / 'brutos'
    PROCESSED = DATA / 'processados'

    # load digesto
    fpath = [PROCESSED / f'processos0{i}_movs.csv' for i in range(1, 4)]
    dados = pd.concat(
        [pd.read_csv(fp, low_memory=False, dtype=str) for fp in fpath],
        ignore_index=True)
    dados['numero_cnj'] = dados['numero_cnj'].str.replace(r'[-.]', '', regex=True)

    # load cnj inova
    inova = pd.read_feather(PROCESSED / 'da_basic_transform.feather')
    inova = inova[['file_json', 'rowid', 'numero']]
    inova = inova.dropna(subset=['numero'])

    # extract the CNJ numbers
    numero_cnj = dados['numero_cnj'].to_list()
    numero_cnj = set(numero_cnj)

    # filter inova lawsuits for which we can recover text
    inova = inova[inova['numero'].isin(numero_cnj)]
    dados = dados[dados['numero_cnj'].isin(inova['numero'])]

    # save the join keys and the digesto dataset
    inova.to_csv(PROCESSED / 'join_keys.csv', index=False)
    dados.to_csv(PROCESSED / 'movs_texto.csv', index=False, quoting=1)

    # produce list for extracting info from cnj inova
    inova = inova[['file_json', 'rowid']]
    inova = inova.groupby('file_json')['rowid'].apply(list)
    inova = inova.reset_index()
    inova['file_json'] = inova['file_json'].str[3:]
    inova = inova.itertuples(name=None, index=False)

    # execute loops to read, process and join files
    kwargs = {'n_jobs': -2, 'verbose': 10}
    movs = Parallel(**kwargs)(delayed(parse_lawsuits)(*p) for p in inova)
    movs = pd.concat(movs, ignore_index=True)
    movs = movs.astype(str)
    movs.to_csv(PROCESSED / 'movs_inova.csv', index=False, quoting=1)
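
# Sketch of the tuple-unpacking call above (delayed(parse_lawsuits)(*p));
# `fake_parse_lawsuits` is a hypothetical stand-in for the project's
# parse_lawsuits(file_json, rowids).
import pandas as pd
from joblib import Parallel, delayed

def fake_parse_lawsuits(file_json, rowids):
    return pd.DataFrame({'file_json': file_json, 'rowid': rowids})

pairs = [('a.json', [1, 2]), ('b.json', [3])]
frames = Parallel(n_jobs=-2, verbose=10)(
    delayed(fake_parse_lawsuits)(*p) for p in pairs)
print(pd.concat(frames, ignore_index=True))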
Example No. 5
def fit(i):
    target = y_train[:, i].toarray().ravel()

    if target.mean() == 0:
        return np.zeros((X_test.shape[0],)) - 1

    d = LogisticRegression(max_iter=10)
    d.fit(X_train, target)
    return d.predict_proba(X_test)[:, 1]

preds = Parallel(n_jobs=8, verbose=50)(delayed(fit)(i) for i in range(y_train.shape[1]))
preds = np.vstack(preds).T

# To reduce memory usage
preds = preds.astype(np.float16)

num = int(np.ceil(num_users * 0.05))

# Let's take not random users, but the ones who viewed a lot
users = train.loc[mask_test].user_id.value_counts().index[:num]
ans_inds = np.argsort(preds[users])

test_inds_dict = {k: list(ans_inds[i, -5:]) for i, k in enumerate(users)}
scorer(y_val_dict, test_inds_dict, num_users=num_users)

# For each user find the categories, which we do not want to predict
last_3weeks = train.loc[mask_test].loc[train.loc[mask_test].date >= train.loc[mask_test].date.max() - 21 + 1]
y_not = last_3weeks.groupby('user_id').id3.apply(set)

y_pred = {}
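
# Self-contained sketch of the one-vs-rest pattern above, fitted on synthetic
# sparse data (names and sizes here are illustrative only).
import numpy as np
from joblib import Parallel, delayed
from scipy import sparse
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_tr = sparse.random(200, 20, density=0.3, random_state=rng, format='csr')
X_te = sparse.random(50, 20, density=0.3, random_state=rng, format='csr')
Y_tr = sparse.csc_matrix((rng.rand(200, 5) < 0.2).astype(int))

def fit_one(i):
    target = Y_tr[:, i].toarray().ravel()
    if target.mean() == 0:            # no positive examples for this label
        return np.zeros(X_te.shape[0]) - 1
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_tr, target)
    return clf.predict_proba(X_te)[:, 1]

probs = np.vstack(Parallel(n_jobs=2)(
    delayed(fit_one)(i) for i in range(Y_tr.shape[1]))).T
print(probs.shape)  # (50, 5)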
def task7(args):
    """
    Core logic for task7
    """
    num_cores = multiprocessing.cpu_count()
    k = int(args.k)
    tensorFileName = "userImageLocation-tensor.npy"
    factorMatricesFileName = "factor-matrices" + str(k) + ".npy"
    cwd = os.getcwd()

    # Load each object space into dictionary where d_obj = {'id': {'term': df, ...}, ...}
    print("Loading User Space...")
    userFile = '../Data/devset_textTermsPerUser.txt'
    d_user = read_text_descriptor_files(userFile)

    print("Loading Image Space...")
    imageFile = '../Data/devset_textTermsPerImage.txt'
    d_images = read_text_descriptor_files(imageFile)

    print("Loading Location Space...")
    locFile = '../Data/devset_textTermsPerPOI.txt'
    d_locations = read_text_descriptor_files(locFile)

    user_list = list(d_user.keys())
    image_list = list(d_images.keys())
    loc_list = list(d_locations.keys())
    print(len(user_list), len(image_list), len(loc_list))

    if os.path.exists(cwd + '/' + tensorFileName):
        print("Loading Tensor...")
        tensor = np.load(tensorFileName)
    else:
        print("Creating Tensor...")

        def processInput(i):
            # Create a slice of 3-D tensor (combinations of loc & image per user)
            print('Started for user ' + str(i))
            user = user_list[i]
            array = [[0 for _ in range(len(loc_list))]
                     for _ in range(len(image_list))]
            for j in range(len(image_list)):
                for l in range(len(loc_list)):
                    image = image_list[j]
                    loc = loc_list[l]
                    # Number of terms shared by all three entities (set intersection)
                    shared_terms = (d_user[user].keys()
                                    & d_images[image].keys()
                                    & d_locations[loc].keys())
                    array[j][l] += len(shared_terms)

            print('Ended for user ' + str(i))
            return array

        tensor = Parallel(n_jobs=num_cores - 1)(delayed(processInput)(i)
                                                for i in range(len(user_list)))
        tensor = np.array(tensor)
        print(tensor.shape)
        np.save(tensorFileName, tensor)

    print('Tensor created')

    if not os.path.exists(cwd + '/' + factorMatricesFileName):
        # Perform CP decomposition via ALS
        tensor = tensor.astype(float)
        print("Performing CP Decomposition...")
        factors = parafac(tensor=tensor, rank=k, n_iter_max=150, init='random')
        np.save(factorMatricesFileName, factors)
    else:
        factors = np.load(factorMatricesFileName, allow_pickle=True)

    print("Factor Matrices created")

    indexToSpaceIds = {0: user_list, 1: image_list, 2: loc_list}

    def createGroups(factor_index):
        # Create k non-overlapping groups
        f_matrix = factors[factor_index]  # factor matrix to be used
        groups = []
        for i in range(k):
            groups.append([])

        for j in range(f_matrix.shape[0]):
            # Assign object to one of k groups/latent-features that it has highest membership towards
            object_id = indexToSpaceIds[factor_index][
                j]  # Map indices back to user/image/location id's
            group_index = np.argmax(f_matrix[j])
            groups[group_index].append(object_id)

        return groups

    groupsList = Parallel(n_jobs=num_cores - 1)(delayed(createGroups)(i)
                                                for i in [0, 1, 2])

    # Output Results
    with open("task7_output.txt", "w") as f:
        userGroups = groupsList[0]
        f.write("\n********** K-USER GROUPS **********\n")
        for i in range(k):
            printGroups(userGroups, i, f)

        imageGroups = groupsList[1]
        f.write("\n********** K-IMAGE GROUPS **********\n")
        for i in range(k):
            printGroups(imageGroups, i, f)

        locGroups = groupsList[2]
        f.write("\n********** K-LOCATION GROUPS **********\n")
        for i in range(k):
            printGroups(locGroups, i, f)
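
# Minimal sketch of the CP-decomposition + argmax grouping step above, assuming a
# recent tensorly in which parafac returns a (weights, factors) pair; the toy
# tensor and id lists are illustrative only.
import numpy as np
import tensorly as tl
from tensorly.decomposition import parafac

rng = np.random.RandomState(0)
tensor = tl.tensor(rng.rand(6, 5, 4))       # toy (user x image x location) tensor
k = 3

weights, factors = parafac(tensor, rank=k, n_iter_max=100, init='random')

ids = {0: [f'user{i}' for i in range(6)],
       1: [f'img{i}' for i in range(5)],
       2: [f'loc{i}' for i in range(4)]}

for mode, f_matrix in enumerate(factors):
    groups = [[] for _ in range(k)]
    for row, object_id in enumerate(ids[mode]):
        # Assign each object to the latent feature it belongs to most strongly.
        groups[int(np.argmax(f_matrix[row]))].append(object_id)
    print(mode, groups)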
Example No. 7
def inferred_bsa(job, dataset_name, cdd, cores=NUM_WORKERS):
    job.log("INF CDD {}".format(cdd))
    cdd_bsa_path = os.path.join(get_interfaces_path(dataset_name),
                                "by_superfamily", str(int(cdd)), str(int(cdd)))

    if not os.path.isfile(cdd_bsa_path + "_bsa.h5"):
        job.log("observed bsa must exist")
        return

    print("Reading obs bsa")
    store = pd.HDFStore(unicode(cdd_bsa_path + "_bsa.h5"))

    # if "/inferred" in store.keys():
    #     return

    try:
        cdd_obs_bsa = store.get("/observed")
    except KeyError:
        raise RuntimeError("Must calculate observed BSAs first")

    try:
        cdd_obs_bsa = cdd_obs_bsa[[
            "obs_int_id", "bsa", "c1_asa", "c2_asa", "face1_asa", "face2_asa",
            "complex_asa", "ppi_type"
        ]]
    except KeyError:
        job.log("Failed due to column select {}".format(cdd_obs_bsa.columns))
        raise

    cdd_obs_bsa = cdd_obs_bsa.rename(
        columns={
            "obs_int_id": "nbr_obs_int_id",
            "bsa": "obs_bsa",
            "c1_asa": "c1_asa_obs",
            "face1_asa": "face1_asa_obs",
            "complex_asa": "complex_asa_obs",
            "ppi_type": "ppi_type_obs"
        })

    inf_interactome_path = unicode(cdd_bsa_path + ".inferred_interactome")
    try:
        print("Reading  inf interactome")
        int_store = pd.HDFStore(unicode(cdd_bsa_path +
                                        ".inferred_interactome"))
        if "/table" not in int_store.keys():
            return
        m = re.search("nrows->(\d+)", int_store.info())
        if not m:
            int_store.close()
            job.log("Unable to read inferred interactome")
            return

        if int(m.group(1)) > 1000000:
            int_store.close()
            return inferred_bsa_dask(cdd_obs_bsa, cdd_bsa_path)
        # Equivalent to: pd.read_hdf(unicode(cdd_bsa_path + ".inferred_interactome"), "table").reset_index()
        inf_interactome = int_store.get("/table")
    except MemoryError:
        return inferred_bsa_dask(cdd_obs_bsa, cdd_bsa_path)

    if inf_interactome.shape[0] > 1000000:
        int_store.close()
        del inf_interactome
        return inferred_bsa_dask(cdd_obs_bsa, cdd_bsa_path)

    inf_interactome = pd.merge(inf_interactome,
                               cdd_obs_bsa,
                               how="left",
                               on="nbr_obs_int_id")

    #Remove redundant interfaces
    inf_interactome = inf_interactome.groupby(
        ["mol_sdi", "nbr_obs_int_id", "mol_sdi_from", "mol_sdi_to"],
        as_index=False).nth(0).reset_index(drop=True).copy()

    bsa = Parallel(n_jobs=NUM_WORKERS)(delayed(get_asa)(group) for _, group in \
        inf_interactome.groupby(["mol_sdi", "nbr_obs_int_id"], as_index=False))
    bsa = pd.concat(bsa, axis=1).T
    bsa = bsa.astype({
        "mol_sdi": np.float64,
        "nbr_obs_int_id": np.float64,
        "c1_asa": np.float64,
        "face1_asa": np.float64,
        "bsa": np.float64,
        "complex_asa": np.float64,
        "pred_ratio": np.float64,
        "ppi_type": str
    })

    inf_interactome = pd.merge(inf_interactome,
                               bsa,
                               how="left",
                               on=["mol_sdi", "nbr_obs_int_id"])

    inf_interactome.to_hdf(unicode(cdd_bsa_path + "_bsa.h5"),
                           "inferred",
                           format='table',
                           append=True,
                           complevel=9,
                           complib="bzip2")
    print(unicode(cdd_bsa_path + "_bsa.h5"))
    int_store.close()
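
# Minimal sketch of the HDF5 table round-trip used above, with toy key names and
# values; assumes PyTables is installed.
import pandas as pd

toy = pd.DataFrame({'nbr_obs_int_id': [1, 2], 'bsa': [10.5, 7.2]})
toy.to_hdf('example_bsa.h5', key='observed', format='table',
           complevel=9, complib='bzip2')

with pd.HDFStore('example_bsa.h5') as store:
    if '/observed' in store.keys():
        print(store.get('/observed'))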
Example No. 8
def observed_bsa(job, dataset_name, cdd, cores=NUM_WORKERS):
    job.log("CDD {}".format(cdd))
    prefix = os.path.join(get_interfaces_path(dataset_name), "by_superfamily",
                          str(int(cdd)), str(int(cdd)))

    # if os.path.isfile(prefix+"_bsa.h5"):
    #     store = pd.HDFStore(unicode(prefix+"_bsa.h5"))
    #     if "/observed" in store.keys():
    #         store.close()
    #         return
    #     store.close()

    cdd_interactome_path = prefix + ".observed_interactome"

    cdd_interactome = pd.read_hdf(unicode(cdd_interactome_path), "table")

    if cdd_interactome.shape[0] == 0:
        job.log("CDD observed interactome is empty -- FIX!!!")
        return

    if cdd_interactome.shape[0] == 0:
        job.log(
            "CDD observed interactome contains intra-chain PPI, skipped -- FIX!!!"
        )
        return

    #Remove redundant interfaces
    cdd_interactome = cdd_interactome.groupby(
        ["obs_int_id", "mol_sdi_from", "mol_sdi_to"],
        as_index=False).nth(0).reset_index(drop=True).copy()

    if "mol_sdi" in cdd_interactome:
        key = "mol_sdi"
    elif "mol_sdi_id" in cdd_interactome:
        key = "mol_sdi_id"
    else:
        raise RuntimeError("sdi not in df")

    bsa = Parallel(n_jobs=NUM_WORKERS)(delayed(get_bsa)(group) for _, group in \
        cdd_interactome.groupby(key, as_index=False))
    bsa = pd.concat(bsa, axis=1).T
    bsa[key] = bsa[key].astype(int)
    bsa = bsa.astype({
        "bsa": np.float64,
        "c1_asa": np.float64,
        "c2_asa": np.float64,
        "complex_asa": np.float64,
        "face1_asa": np.float64,
        "face2_asa": np.float64,
        "ppi_type": str
    })

    cdd_interactome = pd.merge(cdd_interactome, bsa, how="left", on=key)

    cdd_interactome.to_hdf(unicode(prefix + "_bsa.h5"),
                           "observed",
                           format='table',
                           complevel=9,
                           complib="bzip2")
    print(unicode(prefix + "_bsa.h5"))
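
# Self-contained sketch of the groupby -> Parallel -> concat(axis=1).T -> merge
# pattern used in observed_bsa/inferred_bsa; `fake_get_bsa` is a hypothetical
# stand-in for the project's get_bsa/get_asa workers, which return one Series of
# results per group.
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

toy = pd.DataFrame({'mol_sdi': [1., 1., 2., 2., 3.],
                    'area': [10., 12., 7., 9., 4.]})

def fake_get_bsa(group):
    return pd.Series({'mol_sdi': group['mol_sdi'].iloc[0],
                      'bsa': group['area'].sum()})

results = Parallel(n_jobs=2)(delayed(fake_get_bsa)(g)
                             for _, g in toy.groupby('mol_sdi', as_index=False))
bsa = pd.concat(results, axis=1).T.astype({'mol_sdi': np.float64, 'bsa': np.float64})
print(pd.merge(toy, bsa, how='left', on='mol_sdi'))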