Example #1
def test_select():
    df = diamonds[['carat','cut','price']]
    assert df.equals(diamonds >> select('carat','cut','price'))
    assert df.equals(diamonds >> select(0, 1, 6))
    assert df.equals(diamonds >> select(0, 1, 'price'))
    assert df.equals(diamonds >> select([0, X.cut], X.price))
    assert df.equals(diamonds >> select(X.carat, X['cut'], X.price))
    assert df.equals(diamonds >> select(X[['carat','cut','price']]))
    assert df.equals(diamonds >> select(X[['carat','cut']], X.price))
    assert df.equals(diamonds >> select(X.iloc[:,[0,1,6]]))
    assert df.equals(diamonds >> select([X.loc[:, ['carat','cut','price']]]))
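The assertions above show that dfply's select accepts column names, positional indices, and X expressions interchangeably. A minimal standalone sketch of that equivalence, assuming dfply is installed and exposes the bundled diamonds frame:

from dfply import X, select, diamonds

a = diamonds >> select('carat', 'cut')   # by name
b = diamonds >> select(0, 1)             # by position
c = diamonds >> select(X.carat, X.cut)   # by X expression
assert a.equals(b) and b.equals(c)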
Example #2
def test_cummin():
    df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
    df_cm = df >> mutate(cm=cummin(X.x))
    df_truth = df.copy()
    df_truth['cm'] = pd.Series([3.95, 3.89, 3.89, 3.89, 3.89])
    assert df_cm.equals(df_truth)
    df_cm = df >> groupby(X.cut) >> mutate(cm=cummin(X.x))
    df_truth['cm'] = pd.Series([3.95, 3.89, 4.05, 3.89, 4.05])
    assert df_cm.equals(df_truth)
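The same grouped-window behavior as above (and as in the cumsum, cummean, and cumprod tests that follow) on a toy frame; a minimal sketch assuming dfply is installed:

import pandas as pd
from dfply import X, group_by, mutate, cummin

toy = pd.DataFrame({'g': ['a', 'b', 'a', 'b'], 'v': [3.0, 2.0, 1.0, 4.0]})
out = toy >> group_by(X.g) >> mutate(cm=cummin(X.v))
# group 'a' sees [3.0, 1.0] -> cummin [3.0, 1.0]; group 'b' sees [2.0, 4.0] -> [2.0, 2.0]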
Example #3
def test_cumsum():
    df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
    df_cs = df >> mutate(cs=cumsum(X.x))
    df_truth = df.copy()
    df_truth['cs'] = pd.Series([3.95, 7.84, 11.89, 16.09, 20.43])
    assert df_cs.equals(df_truth)
    df_cs = df >> groupby(X.cut) >> mutate(cs=cumsum(X.x))
    df_truth['cs'] = pd.Series([3.95, 3.89, 4.05, 8.09, 8.39])
    assert df_cs.equals(df_truth)
Example #4
def test_cummean():
    df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
    df_cm = df >> mutate(cm=cummean(X.x))
    df_truth = df.copy()
    df_truth['cm'] = pd.Series([3.950000, 3.920000, 3.963333, 4.022500, 4.086000])
    assert df_cm.equals(df_truth)
    df_cm = df >> groupby(X.cut) >> mutate(cm=cummean(X.x))
    df_truth['cm'] = pd.Series([3.950, 3.890, 4.050, 4.045, 4.195])
    assert df_cm.equals(df_truth)
Example #5
def test_cumprod():
    df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
    df_cp = df >> mutate(cp=cumprod(X.x))
    df_truth = df.copy()
    df_truth['cp'] = pd.Series([3.950000, 15.365500, 62.230275, 261.367155, 1134.333453])
    assert df_cp.equals(df_truth)
    df_cp = df >> groupby(X.cut) >> mutate(cp=cumprod(X.x))
    df_truth['cp'] = pd.Series([3.950, 3.890, 4.050, 16.338, 17.577])
    # some tricky floating point stuff going on here
    diffs = df_cp.cp - df_truth.cp
    assert all(abs(diffs) < .0000001)
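For float comparisons like the one above, np.allclose is the more robust idiom: it applies both relative and absolute tolerances and rejects errors of either sign. A small sketch:

import numpy as np

a = np.array([1.0000001, 2.0])
b = np.array([1.0, 2.0])
assert np.allclose(a, b, atol=1e-6)  # rejects large errors of either sign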
Example #6
def test_min_rank():
    df = diamonds.copy() >> head(5) >> select(X.cut, X.x)
    df_mr = df >> mutate(mr=min_rank(X.x))
    df_truth = df.copy()
    df_truth['mr'] = pd.Series([2.0, 1.0, 3.0, 4.0, 5.0])
    assert df_mr.equals(df_truth)
    df_mr = df >> mutate(mr=min_rank(X.cut))
    df_truth['mr'] = pd.Series([3.0, 4.0, 1.0, 4.0, 1.0])
    assert df_mr.equals(df_truth)
    df_mr = df >> groupby(X.cut) >> mutate(mr=min_rank(X.x))
    df_truth['mr'] = pd.Series([1.0, 1.0, 1.0, 2.0, 2.0])
    assert df_mr.equals(df_truth)
    df_mr = df >> mutate(mr=min_rank(X.x, ascending=False))
    df_truth['mr'] = pd.Series([4.0, 5.0, 3.0, 2.0, 1.0])
    assert df_mr.equals(df_truth)
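dfply's min_rank mirrors pandas' Series.rank(method='min'): ties share the lowest rank and the skipped ranks follow. A quick pandas-only sketch:

import pandas as pd

s = pd.Series([4.05, 3.95, 4.20, 3.95])
print(s.rank(method='min'))                   # 3.0, 1.0, 4.0, 1.0
print(s.rank(method='min', ascending=False))  # 2.0, 3.0, 1.0, 3.0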
Example #7
    def exec(self):

        try:
            log.info('[START] {}'.format("exec"))

            fileInfoPattern = '{}/{}/{}'.format(globalVar['inpPath'], serviceName, '1.csv')
            fileInfo = glob.glob(fileInfoPattern)
            if (len(fileInfo) < 1): raise Exception("[ERROR] fileInfo : {} : {}".format("Please check the input data.", fileInfoPattern))

            data = pd.read_csv(fileInfo[0], skiprows = 16)

            # Expected columns after read_csv:
            # Index(['DateTime', 'Latitude', 'L Sensing Latitude', 'R Sensing Latitude',
            #        'Longitude', 'L Sensing Longitude', 'R Sensing Longitude',
            #        'Sensor R S1', 'Sensor L S1', 'Cropspec Root S1'],
            #       dtype='object')

            dataL1 = (
                    (
                        data >>
                        dfply.select(
                            dfply.X['DateTime']
                            , dfply.X['Latitude']
                            , dfply.X['Longitude']
                            , dfply.X['Cropspec Root S1']

                            , dfply.X['L Sensing Latitude']
                            , dfply.X['L Sensing Longitude']
                            , dfply.X['Sensor L S1']

                            , dfply.X['R Sensing Latitude']
                            , dfply.X['R Sensing Longitude']
                            , dfply.X['Sensor R S1']
                        )
                    )
            )

            dataL2 = dataL1.replace(0, np.nan)\
                .dropna(axis = 0)

            dataL3 = pd.concat([
                dataL2[['DateTime', 'Latitude', 'Longitude', 'Cropspec Root S1']].set_axis(['DateTime', 'y', 'x', 'S1'], axis=1)
                , dataL2[['DateTime', 'L Sensing Latitude', 'L Sensing Longitude', 'Sensor L S1']].set_axis(['DateTime', 'y', 'x', 'S1'], axis=1)
                , dataL2[['DateTime', 'R Sensing Latitude', 'R Sensing Longitude', 'Sensor R S1']].set_axis(['DateTime', 'y', 'x', 'S1'], axis=1)
            ]
                , axis=0
            )

            dataL4 = dataL3.sort_values(by=['DateTime'], axis=0)

            saveFile = '{}/{}_{}'.format(globalVar['outPath'], serviceName, '2021_nagano_S1_01_raw.csv')
            log.info('[CHECK] saveFile : {}'.format(saveFile))

            dataL4.to_csv(saveFile, index=False)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e
        finally:
            log.info('[END] {}'.format("exec"))
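The concat step above stacks three (latitude, longitude, sensor) column triples into one long table with a shared schema. A toy sketch of the same pattern (the column names here are made up):

import pandas as pd

wide = pd.DataFrame({'t': [1], 'y1': [36.6], 'x1': [138.2], 'v1': [0.5],
                     'y2': [36.7], 'x2': [138.3], 'v2': [0.6]})
long_df = pd.concat([
    wide[['t', 'y1', 'x1', 'v1']].set_axis(['t', 'y', 'x', 'v'], axis=1),
    wide[['t', 'y2', 'x2', 'v2']].set_axis(['t', 'y', 'x', 'v'], axis=1),
], axis=0, ignore_index=True)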
Example #8
    def exec(self):

        try:
            log.info('[START] {}'.format("exec"))

            fileInfo1 = glob.glob('{}/{}'.format(globalVar['inpPath'],
                                                 'LSH0183/result/reply.csv'))
            if (len(fileInfo1) < 1):
                raise Exception("[ERROR] fileInfo1 : {}".format("Please check the input data."))

            replyData = (
                (pd.read_csv(fileInfo1[0]) >> dfply.mutate(
                    title='', view='', content=dfply.X.reply, flag='reply') >>
                 dfply.select(dfply.X.idx_no, dfply.X.title, dfply.X.content,
                              dfply.X.nick, dfply.X.date, dfply.X.view,
                              dfply.X.flag, dfply.X.thread)))

            contentInfo = glob.glob('{}/{}'.format(
                globalVar['inpPath'], 'LSH0183/INPUT/CONTENT_RESULT.xlsx'))
            if (len(contentInfo) < 1):
                raise Exception(
                    "[ERROR] contentInfo : {}".format("Please check the input data."))

            sheetList = ['황반변성', '비오뷰', '루센티스', '아일리아', '아바스틴']

            for sheetInfo in sheetList:
                log.info('[CHECK] sheetInfo : {}'.format(sheetInfo))

                keyData = ((pd.read_excel(contentInfo[0], sheet_name=sheetInfo)
                            >> dfply.filter_by(dfply.X.flag == 'content') >>
                            dfply.mutate(thread='')))

                data = pd.DataFrame()
                for i in range(len(keyData)):
                    keyDataL1 = ((keyData >> dfply.filter_by(
                        dfply.X.idx_no == keyData['idx_no'][i],
                        ~dfply.X.view.isnull()  # `!= None` is element-wise True for every row
                    ) >> dfply.mutate(url=(
                        "https://cafe.naver.com/maculardegeneration?iframe_url_utf8=%2FArticleRead.nhn%253Fclubid%3D21788988%2526page%3D1%2526boardtype%3DL%2526articleid%3D{}%2526referrerAllArticles%3Dtrue"
                    ).format(keyData['idx_no'][i]))))

                    replyDataL1 = ((replyData >> dfply.filter_by(
                        dfply.X.idx_no == keyData['idx_no'][i],
                        dfply.X.thread != '') >> dfply.mutate(url='')))

                    # append row-wise
                    data = pd.concat([data, keyDataL1, replyDataL1], axis=0)

                saveFile = '{}/{}_키워드_{}.xlsx'.format(globalVar['outPath'],
                                                      serviceName, sheetInfo)
                log.info('[CHECK] saveFile : {}'.format(saveFile))

                data.to_excel(saveFile, index=False)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e
        finally:
            log.info('[END] {}'.format("exec"))
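filter_by (an alias of dfply's mask) AND-s its arguments together, which is how the loop above separates content rows from reply rows. A minimal sketch with made-up columns:

import pandas as pd
from dfply import X, filter_by

posts = pd.DataFrame({'idx_no': [1, 1, 2], 'thread': ['t1', '', 't2']})
replies = posts >> filter_by(X.idx_no == 1, X.thread != '')
# only the row (1, 't1') survives; the conditions are AND-ed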
Example #9
    def create_fmap(mapping, similarity, angle, ranked=ranked, dims=dims):
        data = pd.DataFrame.from_dict({
            'feature':
            similarity.index.values.tolist(),
            'dim1':
            mapping.iloc[:, 0].tolist(),
            'dim2':
            mapping.iloc[:, 1].tolist(),
            'dim3':
            mapping.iloc[:, 2].tolist()
        })

        if ranked:
            data = data >> arrange(
                X.dim1) >> mutate(dim1=np.arange(data.shape[0]) + 1)
            data = data >> arrange(
                X.dim2) >> mutate(dim2=np.arange(data.shape[0]) + 1)

        data = data >> arrange(X.dim1)
        data = data >> mutate(
            resize_x=np.round(np.linspace(1, dims, data.shape[0])))
        data.resize_x = data.resize_x.astype(int)
        data = data >> arrange(X.dim2)
        data = data >> mutate(
            resize_y=np.round(np.linspace(1, dims, data.shape[0])))
        data.resize_y = data.resize_y.astype(int)

        data2 = pd.DataFrame.from_dict({
            'rot_x': data.resize_x,
            'rot_y': data.resize_y
        })
        data2 = rotate_2_col_mat(data2, angle)
        data = data >> bind_cols(data2)
        del data2

        data = data >> arrange(X.rot_x)
        data = data >> mutate(x=np.round(np.linspace(1, dims, data.shape[0])))
        data.x = data.x.astype(int)

        data = data >> arrange(X.rot_y)
        data = data >> mutate(y=np.round(np.linspace(1, dims, data.shape[0])))
        data.y = data.y.astype(int)

        data = data >> arrange(X.dim3)

        data2 = {}
        data2['X'] = data >> select(X.x, X.y)
        data2['Y'] = data2['X'].drop_duplicates()
        data2['X'] = np.arange(data2['Y'].shape[0])
        data2['Z'] = data
        for i in data2['X']:
            data2['result'] = data2['Z'] >> mask(X.x == data2['Z'].x[i])
            data2['result'] = data2['result'] >> mask(X.y == data2['Z'].y[i])
            data2['result.z'] = np.arange(data2['result'].shape[0]) + 1
            data2['result.z'] = data2['result.z'].tolist()
            data2['result'] = data2['result'] >> mutate(z=data2['result.z'])
            if i == 0:
                data2['results'] = data2['result']
            else:
                data2['results'] = pd.concat([data2['results'],
                                              data2['result']])
        data = data2['results']
        del data2

        data2a = similarity.index.values
        data2b = data >> mask(data.feature.isin(data2a))
        data2a = pd.DataFrame.from_dict({'feature': data2a})
        data2a = data2a >> mask(
            data2a.feature.isin(data2b['feature'].to_numpy()))
        data = data2a >> left_join(data2b, by='feature')
        del data2a, data2b

        data = data.set_index('feature')
        data = data >> select(X.x, X.y, X.z)
        data = data >> arrange(X.z, X.y, X.x)

        return data
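The heart of create_fmap is snapping sorted (optionally ranked) coordinates onto an integer dims x dims grid with evenly spaced cells; a standalone sketch of that single step, assuming 20 features and dims=7:

import numpy as np

n, dims = 20, 7
cells = np.round(np.linspace(1, dims, n)).astype(int)
# sorted features receive evenly spaced integer cells in [1, dims]
print(cells)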
Example #10
def read(path):
    """
    Read a .ts.tar.gz file into a TidySet.

    This function reads multiple files archived by tar with gzip compression
    into a TidySet.

    :param path: Path of the .ts.tar.gz file (including the file extension).
    :return: A TidySet, an ExpressionSet with three tables. The function
    write_ts_tar_gz can write this file back out from a TidySet.
    """

    filename = path
    path = re.sub(r'\.ts\.tar\.gz$', '', filename)
    os.mkdir(path)
    with tarfile.open(filename) as tar:
        tar.extractall(path)

    with open(path + '/others.txt', 'r') as f:
        others = f.read()

    other = re.split('\n', others)
    elements = []
    for i in np.arange(len(other)):
        if re.search('^>>', other[i]):
            elements.append(i)

    # Group the line indices between '>>'-prefixed headers: K[i] will hold
    # the indices of the lines belonging to the i-th header.
    XX = np.arange(len(elements)).tolist()
    Y = elements
    Z = np.arange(len(other)).tolist()
    K = []
    for i in XX:
        if i < (len(Y) - 1):
            L = Z[(Y[i] + 1):Y[i + 1]]
        else:
            L = Z[(Y[i] + 1):]
        K.append(L)

    # Join each header's lines into one space-separated string, keyed by the
    # header name with the '>>' prefix stripped.
    XX = np.arange(len(K))
    Y = K
    Z = other
    K = []
    for i in elements:
        K.append(re.sub('>>', '', other[i]))
    M = dict()
    for i in XX:
        L = []
        for j in Y[i]:
            L.append(Z[j])
        L = ' '.join(L)
        M[K[i]] = L
    others = M
    del XX, Y, Z, K, L, M, i, j, f, other, elements

    adata = pd.read_csv(path + '/exprs.csv',
                        names=re.split('\\s', others['sampleNames']))
    adata.index = re.split('\\s', others['featureNames'])

    pdata_names = re.split('\\s', others['varLabels'])
    pdata_dtype = re.split('\\s', others['varClass'])
    pdata = dict()
    for i in np.arange(len(pdata_dtype)):
        if pdata_dtype[i] == 'numeric':
            pdata[pdata_names[i]] = 'float64'
        elif pdata_dtype[i] == 'integer':
            pdata[pdata_names[i]] = 'int64'
        elif pdata_dtype[i] == 'factor':
            pdata[pdata_names[i]] = 'category'
        else:
            pdata[pdata_names[i]] = 'object'
    pdata = pd.read_csv(path + '/pData.csv', names=pdata_names, dtype=pdata)
    pdata.index = re.split('\\s', others['sampleNames'])
    string = re.split('\\s', others['varMetadata'])
    for i, c in enumerate(string):
        if c == 'NA':
            string[i] = np.NaN
    pmetadata = pd.DataFrame(string,
                             index=re.split('\\s', others['varLabels']),
                             columns=['labelDescription'])
    pdata = AnnotatedDataFrame(pdata, pmetadata)

    fdata_names = re.split('\\s', others['fvarLabels'])
    fdata_dtype = re.split('\\s', others['fvarClass'])
    fdata = dict()
    for i in np.arange(len(fdata_dtype)):
        if fdata_dtype[i] == 'numeric':
            fdata[fdata_names[i]] = 'float64'
        elif fdata_dtype[i] == 'integer':
            fdata[fdata_names[i]] = 'int64'
        elif fdata_dtype[i] == 'factor':
            fdata[fdata_names[i]] = 'category'
        else:
            fdata[fdata_names[i]] = 'object'
    fdata = pd.read_csv(path + '/fData.csv', names=fdata_names, dtype=fdata)
    fdata.index = re.split('\\s', others['featureNames'])
    fdata.index.name = 'pos_id'

    sim_names = re.split('\\s', others['simNames'])
    sim_dtype = dict()
    for i in np.arange(len(sim_names)):
        sim_dtype[sim_names[i]] = 'float64'
    similarity = pd.read_csv(path + '/similarity.csv',
                             names=sim_names,
                             dtype=sim_dtype)
    similarity.index = sim_names

    ontomap = adata.transpose()
    # Recover the x/y/z grid extents from pos_id labels of the form 'x#y#z#'.
    for i in np.arange(ontomap.columns.values.shape[0]):
        dim = re.split('x|y|z', ontomap.columns.values[i])
        if i > 0:
            dim[1] = np.max([int(dim[1]), int(dim_[1])])
            dim[2] = np.max([int(dim[2]), int(dim_[2])])
            dim[3] = np.max([int(dim[3]), int(dim_[3])])
        dim_ = dim
    del dim_
    ontomap = ontomap.to_numpy()
    ontomap = ontomap.reshape(ontomap.shape[0] * ontomap.shape[1])
    ontomap = np.array(ontomap)
    ontomap = ontomap.reshape(adata.shape[1], dim[2], dim[1], dim[3])

    ontotype = {}
    for i in np.arange(fdata.shape[1] - 1):
        data0 = fdata >> select(~X.feature)
        data = data0.iloc[:, i]
        data = data.reset_index(inplace=False)
        data = data.rename(columns={data.columns.values[1]: 'ontotype'})
        data = data >> mask(X.ontotype == 1)
        data = data >> left_join(fdata.reset_index(inplace=False),
                                 by='pos_id')
        data = data >> select(X.pos_id, X.feature)

        data2 = data >> separate(X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
        data2 = data2[['feature', 'x', 'y', 'z']].set_index('feature')

        ontotype[data0.columns.values[i]] = data2

    ontotype['root'] = fdata.reset_index(inplace=False) >> select(
        X.pos_id, X.feature)
    ontotype['root'] = ontotype['root'] >> mutate(f_str=X.feature.astype(str))
    ontotype['root'] = ontotype['root'] >> mask(
        X.f_str != 'nan') >> select(~X.f_str)
    ontotype['root'] = ontotype['root'] >> separate(
        X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
    ontotype['root'] = ontotype['root'][['feature', 'x', 'y',
                                         'z']].set_index('feature')

    string = re.split('\\s', others['fvarMetadata'])
    for i, c in enumerate(string):
        if c == 'NA':
            string[i] = np.NaN
    fmetadata = pd.DataFrame(string,
                             index=re.split('\\s', others['fvarLabels']),
                             columns=['labelDescription'])
    fdata.index.name = None
    fdata = AnnotatedDataFrame(fdata, fmetadata)

    ontology_names = re.split('\\s', others['ontoNames'])
    ontology_dtype = re.split('\\s', others['ontoClass'])
    ontology = dict()
    for i in np.arange(len(ontology_dtype)):
        if ontology_dtype[i] == 'numeric':
            ontology[ontology_names[i]] = 'float64'
        elif ontology_dtype[i] == 'integer':
            ontology[ontology_names[i]] = 'int64'
        elif ontology_dtype[i] == 'factor':
            ontology[ontology_names[i]] = 'category'
        else:
            ontology[ontology_names[i]] = 'object'
    ontology = pd.read_csv(path + '/ontology.csv',
                           names=ontology_names,
                           dtype=ontology)

    xData = MIAME(name=others['name'],
                  lab=others['lab'],
                  contact=others['contact'],
                  title=others['title'],
                  abstract=others['abstract'],
                  url=others['url'],
                  pubMedIds=others['pubMedIds'],
                  other={
                      'similarity': similarity,
                      'ontomap': ontomap,
                      'ontotype': ontotype,
                      'ontology': ontology
                  })

    for i in os.listdir(path):
        os.remove(path + '/' + i)
    os.rmdir(path)

    if re.match('^( +)', others['annotation']):
        annot = ''
    else:
        annot = re.sub(' +', ' ', others['annotation'])

    eset = ExpressionSet(assayData=adata.to_numpy(),
                         phenoData=pdata,
                         featureData=fdata,
                         experimentData=xData,
                         annotation=annot)

    return eset
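read() extracts the archive next to the input file and removes the extracted files afterwards; a hedged sketch of the same unpack-read-clean-up cycle using tempfile so nothing is left behind (the function and member names here are assumptions):

import os
import tarfile
import tempfile

def read_member(archive_path, member_name):
    # Extract into a temporary directory that is removed automatically.
    with tempfile.TemporaryDirectory() as tmp:
        with tarfile.open(archive_path) as tar:
            tar.extractall(tmp)
        with open(os.path.join(tmp, member_name)) as f:
            return f.read()

# text = read_member('example.ts.tar.gz', 'others.txt')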
Example #11
def compile(value,
            outcome,
            similarity,
            mapping,
            ontology,
            ranked=True,
            dims=7,
            decreasing=False,
            seed_num=33):
    """
  Make a TidySet for visible neural network (VNN) modeling
  
  This function create a TidySet, an ExpressionSet class to orchestrate five
  data into a single set of three tables.
  
  :param value: Instance-feature value, a pandas data frame with rows for
  instances and columns for features. All rows in value should have names. All
  values should be floating numbers.
  :param outcome: Outcome, a single-column pandas data frame of binary integers
  with the same rows as the instances. The row numbers and the order of outcome
  should be the same with those of value. Value  of 0 and 1 should refer to
  non-event and event outcome, respectively.
  :param similarity: Feature similarity, a square pandas data frame of floating
  numbers containing feature-feature similarity measures.
  :param mapping: Feature three-dimensional mapping, a pandas data frame of
  floating numbers with rows for features and three columns for three dimensions
  where the features are mapped onto.
  :param ontology: Ontology, a pandas data frame with rows for ontologies and
  four columns for source, target, similarity, and relation. Feature (source)-
  ontology (target) relation should be annotated as 'feature', while ontology-
  ontology relation should be annotated as 'is_a'. To differentiate between
  feature and ontology names, a prefix of 'ONT:' precedes an ontology name. All
  columns except similarity in ontology should be strings. Similarity (a
  floating number) is a minimum threshold by which either features or ontologies
  (source) belong to an ontology (target).
  :return: output TidySet, an ExpressionSet with three tables. Instance-feature
  value and outcome pandas data frame are compiled as a phenotype pandas data
  frame with rows for instances and columns for features and outcome. Instance-
  feature value and feature three-dimensional mapping pandas data frame are
  compiled as an expression two-dimensional array with rows for positions of
  features and columns for instances. The mapping, similarity, and ontology
  pandas data frame are compiled as a feature pandas data frame with rows for
  positions of features and columns for feature names and ontological relations.
  For easier access, the similarity pandas data frame, ontomap four-dimensional
  numpy array, ontotype dictionary of pandas data frame, and ontology pandas
  data frame are included in experiment notes that can be called using function
  of notes.
  """

    pb = ProgressBar(8)
    tick = 0
    pb.start()

    tick += 1
    pb.update(tick)  #1

    def rotate_2_col_mat(X, angle):
        # Rotate a two-column coordinate frame clockwise by `angle` degrees.
        angle = (math.pi / 180 * angle) * -1
        M = np.array([
            math.cos(angle),
            math.sin(angle), -math.sin(angle),
            math.cos(angle)
        ])
        M = M.reshape(2, 2)
        M = np.dot(X.to_numpy(), M)
        M = pd.DataFrame(M,
                         index=X.index.values.tolist(),
                         columns=X.columns.values.tolist())
        return M

    def create_fmap(mapping, similarity, angle, ranked=ranked, dims=dims):
        data = pd.DataFrame.from_dict({
            'feature':
            similarity.index.values.tolist(),
            'dim1':
            mapping.iloc[:, 0].tolist(),
            'dim2':
            mapping.iloc[:, 1].tolist(),
            'dim3':
            mapping.iloc[:, 2].tolist()
        })

        if ranked:
            data = data >> arrange(
                X.dim1) >> mutate(dim1=np.arange(data.shape[0]) + 1)
            data = data >> arrange(
                X.dim2) >> mutate(dim2=np.arange(data.shape[0]) + 1)

        data = data >> arrange(X.dim1)
        data = data >> mutate(
            resize_x=np.round(np.linspace(1, dims, data.shape[0])))
        data.resize_x = data.resize_x.astype(int)
        data = data >> arrange(X.dim2)
        data = data >> mutate(
            resize_y=np.round(np.linspace(1, dims, data.shape[0])))
        data.resize_y = data.resize_y.astype(int)

        data2 = pd.DataFrame.from_dict({
            'rot_x': data.resize_x,
            'rot_y': data.resize_y
        })
        data2 = rotate_2_col_mat(data2, angle)
        data = data >> bind_cols(data2)
        del data2

        data = data >> arrange(X.rot_x)
        data = data >> mutate(x=np.round(np.linspace(1, dims, data.shape[0])))
        data.x = data.x.astype(int)

        data = data >> arrange(X.rot_y)
        data = data >> mutate(y=np.round(np.linspace(1, dims, data.shape[0])))
        data.y = data.y.astype(int)

        data = data >> arrange(X.dim3)

        data2 = {}
        data2['X'] = data >> select(X.x, X.y)
        data2['Y'] = data2['X'].drop_duplicates()
        data2['X'] = np.arange(data2['Y'].shape[0])
        data2['Z'] = data
        for i in data2['X']:
            data2['result'] = data2['Z'] >> mask(X.x == data2['Z'].x[i])
            data2['result'] = data2['result'] >> mask(X.y == data2['Z'].y[i])
            data2['result.z'] = np.arange(data2['result'].shape[0]) + 1
            data2['result.z'] = data2['result.z'].tolist()
            data2['result'] = data2['result'] >> mutate(z=data2['result.z'])
            if i == 0:
                data2['results'] = data2['result']
            else:
                data2['results'] = pd.concat([data2['results'],
                                              data2['result']])
        data = data2['results']
        del data2

        data2a = similarity.index.values
        data2b = data >> mask(data.feature.isin(data2a))
        data2a = pd.DataFrame.from_dict({'feature': data2a})
        data2a = data2a >> mask(
            data2a.feature.isin(data2b['feature'].to_numpy()))
        data = data2a >> left_join(data2b, by='feature')
        del data2a, data2b

        data = data.set_index('feature')
        data = data >> select(X.x, X.y, X.z)
        data = data >> arrange(X.z, X.y, X.x)

        return data

    def order_angle_by_channel(mapping,
                               similarity,
                               ranked=ranked,
                               dims=dims,
                               decreasing=False):
        angles = np.arange(360) + 1
        for i in angles:
            if i == 1:
                data_ = create_fmap(mapping, similarity, i, ranked, dims)
                data = [np.max(data_['z'])]
            else:
                data_ = create_fmap(mapping, similarity, i, ranked, dims)
                data.append(np.max(data_['z']))

        data = pd.DataFrame.from_dict({
            'angle': angles,
            'channel': np.array(data)
        })
        data = data >> arrange(X.channel, ascending=not decreasing)
        return data

    tick += 1
    pb.update(tick)  #2
    np.random.seed(seed_num)
    angle = order_angle_by_channel(mapping, similarity, ranked, dims,
                                   decreasing)
    angle = angle >> mask(X.channel == np.min(angle['channel']))
    angle = angle['angle'].values
    angle = np.random.choice(angle, 1, False)  # pick one of the tied-best angles

    tick += 1
    pb.update(tick)  #3
    fmap = create_fmap(mapping, similarity, angle, ranked, dims)

    fval = value[fmap.index.values].to_numpy()
    fval = pd.DataFrame(fval,
                        index=value.index.values,
                        columns=value.columns.values)

    fboth = fmap >> summarize_each([np.max], X.x, X.y, X.z)
    fboth = fboth.to_numpy()
    data = []
    for i in np.arange(fboth.shape[1]):
        data_ = np.arange(int(fboth[0, i])) + 1  # 1..max extent for this axis
        data.append(data_.tolist())
        del data_

    fboth = np.meshgrid(data[0], data[1], data[2])
    del data
    fboth = np.array(fboth).T.reshape(-1, 3)
    fboth = pd.DataFrame(fboth, columns=fmap.columns.values)
    fboth = fboth >> arrange(X.z, X.y, X.x)

    fboth = fboth >> left_join(fmap.reset_index(inplace=False),
                               by=['x', 'y', 'z'])

    idx = []
    for i in fboth['feature'].values.tolist():
        idx.append(str(i) != 'nan')

    fval = fval[fboth['feature'][idx]].to_numpy()
    fval = np.matrix.transpose(fval)
    fval = pd.DataFrame(fval,
                        index=fboth['feature'][idx],
                        columns=value.index.values)

    fboth = fboth >> left_join(fval.reset_index(inplace=False), by='feature')

    fboth = fboth >> mutate(x_='x') >> unite(
        'x', ['x_', 'x'], remove=False, sep='')
    fboth = fboth >> select(~X.x_)
    fboth = fboth >> unite('pos_id', ['x', 'y'], remove=True, sep='y')
    fboth = fboth >> unite('pos_id', ['pos_id', 'z'], remove=False, sep='z')
    fboth = fboth >> select(~X.z)

    ori_ontology = ontology

    def str_detect(string, pattern):
        match = []
        for i in string:
            match.append(pattern in i)  # honor the pattern argument
        return match

    while np.sum(str_detect(ontology['source'], 'ONT:')) > 0:

        data = ontology >> mask(X.relation == 'feature')
        for i in np.arange(ontology.shape[0]):
            if 'ONT:' in ontology['source'][i]:
                data2 = data >> mask(X.target == ontology['source'][i])
                if data2.shape[0] > 0:
                    data_ = pd.DataFrame.from_dict({
                        'source':
                        data2['source'],
                        'target':
                        ontology['target'][i],
                        'similarity':
                        ontology['similarity'][i],
                        'relation':
                        'feature'
                    })
                else:
                    data_ = ontology.iloc[[i], :]  # keep a one-row frame
            else:
                data_ = ontology.iloc[[i], :]

            if i == 0:
                data2 = data_
            else:
                data2 = pd.concat([data2, data_])  # accumulate across rows
        ontology = data2
    del data_, data, data2

    tick += 1
    pb.update(tick)  #4
    adata = fboth >> select(~X.feature)
    adata = adata.set_index('pos_id')
    adata = adata.fillna(0)

    pdata = value >> mutate(outcome=outcome.astype(int))
    pdata = pdata >> select(X.outcome, fmap.index.values.tolist())

    fdata = fboth >> select(X.pos_id, X.feature)
    fdata2 = ontology >> select(X.source, X.target)
    fdata2 = fdata2.drop_duplicates()
    fdata2 = fdata2 >> separate(X.target, ['t1', 't2'])
    fdata2 = fdata2 >> mutate(t1='ONT') >> unite('target', ['t1', 't2'],
                                                 sep='')
    fdata2 = fdata2 >> mutate(included=1) >> spread(X.target, X.included)
    fdata2 = fdata2 >> rename(feature=X.source)
    fdata = fdata >> left_join(fdata2, by='feature')
    del fdata2
    fdata = fdata.set_index('pos_id')

    tick += 1
    pb.update(tick)  #5
    ontomap = adata.transpose()
    for i in np.arange(ontomap.columns.values.shape[0]):
        dim = re.split('x|y|z', ontomap.columns.values[i])
        if i > 0:
            dim[1] = np.max([int(dim[1]), int(dim_[1])])
            dim[2] = np.max([int(dim[2]), int(dim_[2])])
            dim[3] = np.max([int(dim[3]), int(dim_[3])])
        dim_ = dim
    del dim_
    ontomap = ontomap.to_numpy()
    ontomap = ontomap.reshape(ontomap.shape[0] * ontomap.shape[1])
    ontomap = np.array(ontomap)
    ontomap = ontomap.reshape(adata.shape[1], dim[2], dim[1], dim[3])

    tick += 1
    pb.update(tick)  #6
    ontotype = {}
    for i in np.arange(fdata.shape[1] - 1):
        data0 = fdata >> select(~X.feature)
        data = data0.iloc[:, i]
        data = data.reset_index(inplace=False)
        data = data.rename(columns={data.columns.values[1]: 'ontotype'})
        data = data >> mask(X.ontotype == 1)
        data = data >> left_join(fdata.reset_index(inplace=False),
                                 by='pos_id')
        data = data >> select(X.pos_id, X.feature)

        data2 = data >> separate(X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
        data2 = data2[['feature', 'x', 'y', 'z']].set_index('feature')

        ontotype[data0.columns.values[i]] = data2

    ontotype['root'] = fdata.reset_index(inplace=False) >> select(
        X.pos_id, X.feature)
    ontotype['root'] = ontotype['root'] >> mutate(f_str=X.feature.astype(str))
    ontotype['root'] = ontotype['root'] >> mask(
        X.f_str != 'nan') >> select(~X.f_str)
    ontotype['root'] = ontotype['root'] >> separate(
        X.pos_id, ['a', 'x', 'y', 'z'], sep='x|y|z')
    ontotype['root'] = ontotype['root'][['feature', 'x', 'y',
                                         'z']].set_index('feature')

    data2a = fmap.reset_index(inplace=False)
    data2b = similarity[data2a['feature'].to_numpy().tolist()]
    data2b = data2b.reset_index(inplace=False)
    data2a = data2a >> rename(index=X.feature)
    similarity = data2a >> left_join(data2b, by='index')
    del data2a, data2b
    similarity = similarity >> select(~X.x, ~X.y, ~X.z)
    similarity = similarity.set_index('index')
    similarity.index.name = None

    tick += 1
    pb.update(tick)  #7
    adata.index.name = None
    fdata.index.name = None
    ori_ontology.index = pd.Index(np.arange(ori_ontology.shape[0]))
    output = ExpressionSet(assayData=adata.to_numpy(),
                           phenoData=AnnotatedDataFrame(pdata),
                           featureData=AnnotatedDataFrame(fdata),
                           experimentData=MIAME(
                               other={
                                   'similarity': similarity,
                                   'ontomap': ontomap,
                                   'ontotype': ontotype,
                                   'ontology': ori_ontology
                               }))

    tick += 1
    pb.update(tick)  #8
    return output
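A usage sketch for the rotate_2_col_mat helper defined above: because the angle is negated before building the matrix, a positive angle rotates clockwise. The helper is repeated here so the sketch runs standalone:

import math
import numpy as np
import pandas as pd

def rotate_2_col_mat(X, angle):
    # same construction as the helper above
    angle = (math.pi / 180 * angle) * -1
    M = np.array([math.cos(angle), math.sin(angle),
                  -math.sin(angle), math.cos(angle)]).reshape(2, 2)
    return pd.DataFrame(np.dot(X.to_numpy(), M),
                        index=X.index, columns=X.columns)

pt = pd.DataFrame({'rot_x': [1.0], 'rot_y': [0.0]})
print(rotate_2_col_mat(pt, 90).round(6))  # unit x vector -> (0, -1)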
Example #12
    sns.boxplot(x="d2", y="val", data=dataL2)
    plt.savefig(saveImg, dpi=600, bbox_inches='tight')
    plt.show()

    # Scatter plot of transaction amount per annual income
    saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName,
                                    '연소득당 거래금액 산점도')

    makeScatterPlot(dataL2['meanCost'], dataL2['거래금액'], saveImg, 3500, 100000)

    # *******************************************************
    # Data analysis (applying analytic techniques)
    # *******************************************************
    # Regression analysis of housing-price determinants
    dataL4 = ((dataL2 >> dfply.select(dfply.X.건축년도, dfply.X.전용면적, dfply.X.층,
                                      dfply.X.val2, dfply.X.d2, dfply.X.val) >>
               dfply.rename(면적당거래금액=dfply.X.val2, 연소득당거래금액=dfply.X.val)))

    # Relationships among housing-price determinants
    saveImg = '{}/{}_{}.png'.format(globalVar['figPath'], serviceName,
                                    '주택 가격 결정 요인을 위한 관계성')

    sns.pairplot(dataL4)
    plt.savefig(saveImg, dpi=600, bbox_inches='tight')
    plt.show()

    # +++++++++++++++++++++++++++++++++++++++++++++++
    # All apartments
    dataL5 = dataL4
    # +++++++++++++++++++++++++++++++++++++++++++++++
    # Multiple linear regression model for all variables
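The multiple linear regression announced by the last comment is not shown; a hedged sketch of how it might be fit with statsmodels (the formula terms are illustrative, taken from the columns renamed above):

import statsmodels.formula.api as smf

# dataL5 columns follow the rename above; the exact formula is an assumption.
# model = smf.ols('연소득당거래금액 ~ 전용면적 + 층 + 건축년도', data=dataL5).fit()
# print(model.summary())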
Example #13
    else: # can happen at the year beginning
        iso = "%04d-%02d-%02d" % (yy-1, int(mm), int(dd))
    return iso


sh = list()
for d in os.listdir("data"):
    print(d)
    if os.path.isdir("data/%s" % d):
        f01 = "data/%s/chmi_manualmeasure.txt" % d
        if os.path.isfile(f01):
            d01 = pd.read_csv(f01, sep="|")
            y01 = d01 >> \
                mutate(source = 'chmi_man', country = 'cz', date_valid = d) >> \
                rename(snow = X.snowdepth_total) >> \
                select(X.date_valid, X.source, X.country, X.station, X.snow)
            sh.append(y01)
        f02 = "data/%s/chmi_oah.txt" % d
        if os.path.isfile(f02):
            d02 = pd.read_csv(f02, sep="|")
            y02 = d02 >> \
                mutate(source = 'chmi_oah', country = 'cz')
            y02['date_valid'] = [date_cz2iso(row['date']) for i, row in d02.iterrows()]
            y02 = y02 >> rename(snow = X.snowdepth_total) >> \
                select(X.date_valid, X.source, X.country, X.station, X.snow)
            sh.append(y02)
        f03 = "data/%s/chmi_resorts.txt" % d
        if os.path.isfile(f03):
            d03 = pd.read_csv(f03, sep="|")
            y03 = d03 >> \
                mutate(source = 'chmi_resorts', country = 'cz')
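Only the year-boundary fallback of date_cz2iso is visible above; a hedged sketch of what such a Czech-date-to-ISO helper might look like, assuming the caller supplies the day, month, and the year to assume:

def cz2iso(dd, mm, yy):
    # Hypothetical helper: Czech sources often write dates as 'd.m.' without
    # a year, so the year has to be inferred from context by the caller.
    return "%04d-%02d-%02d" % (int(yy), int(mm), int(dd))

# cz2iso('31', '12', 2020) -> '2020-12-31'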
Example #14
    def ontology_df(hierarchy, value):
        def linkage_matrix(hierarchy):
            counts = np.zeros(hierarchy.children_.shape[0])
            n_samples = len(hierarchy.labels_)
            for i, merge in enumerate(hierarchy.children_):
                current_count = 0
                for child_idx in merge:
                    if child_idx < n_samples:
                        current_count += 1
                    else:
                        current_count += counts[child_idx - n_samples]
                counts[i] = current_count
            stacked = [hierarchy.children_, hierarchy.distances_, counts]
            return np.column_stack(stacked).astype(float)

        labels = value.columns.values[hierarchy.labels_]
        linkage = linkage_matrix(hierarchy)
        tree = dendrogram(linkage, no_plot=True)  # only the leaf order is needed

        A = pd.DataFrame(labels, columns=['A'])
        A = A >> bind_cols(pd.DataFrame(tree['leaves'], columns=['i']))

        B = pd.DataFrame(labels, columns=['B'])
        B = B >> bind_cols(pd.DataFrame(tree['leaves'], columns=['i2']))

        linkages = pd.DataFrame(linkage,
                                columns=['i', 'i2', 'similarity', 'count'])

        ontology = linkages >> left_join(A, by='i')
        ontology = ontology >> left_join(B, by='i2')
        ontology = ontology >> mutate(similarity=1 - X.similarity)
        ontology = ontology >> mutate(
            target=['ONT:' + str(i + 1) for i in range(ontology.shape[0])])

        ontology = ontology >> mutate(i=X.i - ontology.shape[0])
        ontology = ontology >> mutate(i2=X.i2 - ontology.shape[0])

        A = ontology['i'].values.astype(int)
        A1 = ontology['A'].values
        A2 = ['ONT:' + str(i) for i in A]
        ontology = ontology >> mutate(A=np.where(A <= 0, A1, A2))

        B = ontology['i2'].values.astype(int)
        B1 = ontology['B'].values
        B2 = ['ONT:' + str(i) for i in B]
        ontology = ontology >> mutate(B=np.where(B <= 0, B1, B2))

        ontology = pd.melt(ontology,
                           id_vars=['similarity', 'target', 'i', 'i2'],
                           value_vars=['A', 'B'],
                           var_name='key',
                           value_name='source')

        C = np.where(ontology['key'] == 'A', ontology['i'], ontology['i2'])
        C = np.where(C <= 0, 'feature', 'is_a')

        ontology = ontology >> mutate(relation=C)
        ontology = ontology >> select(X.source, X.target, X.similarity,
                                      X.relation)

        ontology = ontology >> arrange(1 - X.similarity, X.relation)

        return ontology
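ontology_df assumes a fitted sklearn AgglomerativeClustering whose merge distances were recorded, which requires distance_threshold=0 and n_clusters=None (otherwise distances_ is absent); it also relies on scipy's dendrogram. A minimal fitting sketch with made-up data:

import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.default_rng(0)
features = rng.random((6, 10))  # 6 features described by 10 values each
hier = AgglomerativeClustering(distance_threshold=0, n_clusters=None).fit(features)
# hier.children_, hier.distances_, hier.labels_ now feed linkage_matrix()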
Example #15
 df_org = pd.read_csv(file_names[i], header=0)
 for j in np.arange(len(tickers)):
     df = df_org[df_org.SECCODE == tickers[j]]
     df0 = df.query('PRICE!=0')
     df0 = df >> pl.mutate(
         s1=np.where((df.ACTION == 1) & (df.BUYSELL == "S"), df.VOLUME, 0),
         s2=np.where((df.ACTION == 2) & (df.BUYSELL == "S"), df.VOLUME, 0),
         s0=np.where((df.ACTION == 0) & (df.BUYSELL == "S"), df.VOLUME, 0),
         b1=np.where((df.ACTION == 1) & (df.BUYSELL == "B"), df.VOLUME, 0),
         b2=np.where((df.ACTION == 2) & (df.BUYSELL == "B"), df.VOLUME, 0),
         b0=np.where((df.ACTION == 0) & (df.BUYSELL == "B"), df.VOLUME, 0),
         timeb=np.where(
             (df.ACTION == 2) & (df.BUYSELL == "B"), df.NO - 1, 0),
         times=np.where((df.ACTION == 2) &
                        (df.BUYSELL == "S"), df.NO - 1, 0)) >> pl.select([
                            'PRICE', 'ORDERNO', 's1', 's2', 's0', 'b1',
                            'b2', 'b0', 'timeb', 'times'
                        ])
     df0 = df0.groupby(['PRICE', 'ORDERNO']).aggregate({
         's1': np.sum,
         's2': np.sum,
         's0': np.sum,
         'b1': np.sum,
         'b2': np.sum,
         'b0': np.sum,
         'timeb': np.max,
         'times': np.max
     }).reset_index(level=["PRICE", "ORDERNO"])
     pricecum = df.query(
         '(ACTION==2)&(BUYSELL=="S")&(PRICE!=0)').sort_index(
             ascending=False).PRICE.cummax().sort_index()
     ind = pricecum.index.values
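The groupby/aggregate above sums the volume columns and takes the max of the two time columns; in current pandas the same thing reads more clearly as named aggregation. A toy sketch:

import pandas as pd

toy = pd.DataFrame({'PRICE': [1, 1], 'ORDERNO': [7, 7],
                    's1': [2, 3], 'timeb': [5, 9]})
out = toy.groupby(['PRICE', 'ORDERNO']).agg(
    s1=('s1', 'sum'), timeb=('timeb', 'max')).reset_index()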
Example #16
import pandas as pd
from pprint import pprint
import dfply as dpy
import nltk
import gensim

from sklearn.model_selection import train_test_split

import timeit

import matplotlib.pyplot as plt

# Import data
reviews = pd.read_csv(
    'C:/Users/straw/Desktop/stageM2/scripts-francesca/reviews.csv'
) >> dpy.select("Restaurant_ID", "Review_ID", "Review_TEXT")

# Keep only the restaurant 'FR0210153861525'
reviews = reviews[reviews['Restaurant_ID'] == 'FR0210153861525'].reset_index(
    level=0, drop=True) >> dpy.drop("Restaurant_ID")

reviews.index = reviews['Review_ID']
reviews = reviews >> dpy.drop("Review_ID")

# Lower case
reviews['Review_TEXT'] = reviews['Review_TEXT'].apply(
    lambda s: " ".join(w.lower() for w in s.split()))
reviews['Review_TEXT'].head()


# Tokenization function
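The tokenization function itself is cut off here; a plausible minimal version with nltk, hedged (the punkt download and the function name are assumptions about the setup):

import nltk
# nltk.download('punkt')  # one-time setup assumption

def tokenize(text):
    # Split a review into word tokens.
    return nltk.word_tokenize(text)

# reviews['tokens'] = reviews['Review_TEXT'].apply(tokenize)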