Example #1
def test_file_path_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    expect = pdx.read_avro(tf.name)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    assert_frame_equal(expect, dataframe)
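These tests assume a pytest fixture named dataframe and a common import block defined elsewhere in the test module. A minimal sketch of that scaffolding (the fixture body is a hypothetical reconstruction from the columns the tests reference):

import numpy as np
import pandas as pd
import pandavro as pdx
import pytest
from io import BytesIO
from tempfile import NamedTemporaryFile
from pandas.testing import assert_frame_equal

@pytest.fixture
def dataframe():
    # Hypothetical fixture; the real suite may build it differently
    return pd.DataFrame({
        "Boolean": [True, False, True, False],
        "Float64": np.random.randn(4),
        "Int64": np.random.randint(0, 10, 4),
        "String": ['foo', 'bar', 'foo', 'bar'],
        "DateTime64": pd.date_range('20190101', periods=4),
    })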
Example #2
def test_buffer_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    with open(tf.name, 'rb') as f:
        expect = pdx.read_avro(BytesIO(f.read()))
        expect['DateTime64'] = expect['DateTime64'].astype(
            np.dtype('datetime64[ns]'))
    assert_frame_equal(expect, dataframe)
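Since read_avro accepts an in-memory buffer, the round trip can avoid the filesystem entirely; a sketch, assuming to_avro also accepts a writable file-like object:

buf = BytesIO()
pdx.to_avro(buf, dataframe)  # assumption: to_avro accepts file-like objects
buf.seek(0)                  # rewind before reading back
roundtrip = pdx.read_avro(buf)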
Example #3
def main():
    df = pd.DataFrame({"Boolean": [True, False, True, False],
                       "Float64": np.random.randn(4),
                       "Int64": np.random.randint(0, 10, 4),
                       "String": ['foo', 'bar', 'foo', 'bar'],
                       "DateTime64": [pd.Timestamp('20190101'), pd.Timestamp('20190102'),
                                      pd.Timestamp('20190103'), pd.Timestamp('20190104')]})

    pdx.to_avro(OUTPUT_PATH, df)
    saved = pdx.read_avro(OUTPUT_PATH)
    print(saved)
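A hypothetical scaffold for running this example; OUTPUT_PATH is defined elsewhere in the original script, so the value here is an assumption:

import numpy as np
import pandas as pd
import pandavro as pdx

OUTPUT_PATH = "example.avro"  # assumed path

if __name__ == "__main__":
    main()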
Example #4
def test_dataframe_kwargs(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    # include columns
    columns = ['Boolean', 'Int64']
    expect = pdx.read_avro(tf.name, columns=columns)
    df = dataframe[columns]
    assert_frame_equal(expect, df)
    # exclude columns
    columns = ['String', 'Boolean']
    expect = pdx.read_avro(tf.name, exclude=columns)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    df = dataframe.drop(columns, axis=1)
    assert_frame_equal(expect, df)
    # specify index
    index = 'String'
    expect = pdx.read_avro(tf.name, index=index)
    expect['DateTime64'] = expect['DateTime64'].astype(
        np.dtype('datetime64[ns]'))
    df = dataframe.set_index(index)
    assert_frame_equal(expect, df)
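The columns, exclude, and index keywords mirror pandas.DataFrame.from_records, which is consistent with read_avro forwarding extra keyword arguments to it. For comparison, a sketch that selects the same columns directly with fastavro (the library pandavro builds on):

import fastavro

with open(tf.name, 'rb') as f:
    records = list(fastavro.reader(f))
subset = pd.DataFrame.from_records(records, columns=['Boolean', 'Int64'])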
Example #5
def combine_files(json_filepath, avro_filepath, csv_filepath, output_filepath):
    """Combines the three files, drops duplicates, and sorts the resulting
    dataset by the Name column. Writes a CSV file to output_filepath and
    returns a DataFrame with its content.
    """
    # Read all three files and stack them (DataFrame.append was removed in
    # pandas 2.0, so pd.concat is used instead)
    df = pd.concat([
        pd.read_json(json_filepath),
        pd.read_csv(csv_filepath),
        pdx.read_avro(avro_filepath),
    ], ignore_index=True)
    # Drop duplicates
    df = df.drop_duplicates()
    # Sort by Name
    df = df.sort_values(by='Name')
    # Write to CSV without the index so the file round-trips cleanly
    df.to_csv(output_filepath, index=False)
    return pd.read_csv(output_filepath)
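A hypothetical call; the file names are assumptions:

combined = combine_files("cities.json", "cities.avro", "cities.csv", "combined.csv")
print(combined.head())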
Example #6
def converter_csv_to_avro(INPUT_PATH, OUTPUT_PATH, converter_to_datetime):
    df = pd.read_csv(INPUT_PATH)

    # Transform string columns to datetime
    for column in converter_to_datetime:
        df[column] = pd.to_datetime(df[column])

    df.info()  # info() prints its report directly and returns None

    pdx.to_avro(OUTPUT_PATH, df)  # Convert
    saved = pdx.read_avro(OUTPUT_PATH)  # Read back only as a sanity check

    print(saved)
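A hypothetical call, assuming a CSV with two string timestamp columns:

converter_csv_to_avro("events.csv", "events.avro", ["created_at", "updated_at"])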
Example #7
def _read_avro(tmpfile, remove_timezone_from_type=True, *args, **kwargs):
    """
    Reads a DataFrame from an Avro file

    Args:
        tmpfile (tempfile.NamedTemporaryFile):
            Connection to the file to be read from
        remove_timezone_from_type (bool):
            If True, convert timezone-aware datetime columns back to
            timezone-naive datetime64[ns]
    Returns:
        pd.DataFrame: The DataFrame read from Avro
    """

    # If the tempfile hasn't been closed after writing, pandavro raises a
    # ValueError when reading from it, so we open the file a second time by
    # name. This works on Unix-like systems, but not on Windows.
    with open(tmpfile.name, 'rb') as f:
        df = pdx.read_avro(f, *args, **kwargs)

    if remove_timezone_from_type:
        datetime_cols = df.columns[df.dtypes == 'datetime64[ns, UTC]']
        df[datetime_cols] = df[datetime_cols].apply(
            lambda x: x.dt.tz_convert(None))
    return df
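A hypothetical round trip through this helper:

tf = NamedTemporaryFile()
pdx.to_avro(tf.name, df)
df_roundtrip = _read_avro(tf)  # timezone-aware columns come back timezone-naive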
Example #8
def test_file_path_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    expect = pdx.read_avro(tf.name)
    assert_frame_equal(expect, dataframe)
Example #9
def test_buffer_e2e(dataframe):
    tf = NamedTemporaryFile()
    pdx.to_avro(tf.name, dataframe)
    with open(tf.name, 'rb') as f:
        expect = pdx.read_avro(BytesIO(f.read()))
    assert_frame_equal(expect, dataframe)
Example #10
    "Boolean": [True, False, True, False],
    "Float64":
    np.random.randn(4),
    "Int64":
    np.random.randint(0, 10, 4),
    "String": ['foo', 'bar', 'foo', 'bar'],
    "DateTime64": [
        pd.Timestamp('20190101'),
        pd.Timestamp('20190102'),
        pd.Timestamp('20190103'),
        pd.Timestamp('20190104')
    ]
})

pdx.to_avro("SampleAvro2.avro", df)
saved = pdx.read_avro("SampleAvro.avro")
print(saved)

vv = pd.read_csv("Sample2.csv", header=0)
bb = pd.read_csv("Sample.tsv", sep='\t', header=None)
aa = pd.read_excel('Sample.xls')
aa2 = pd.read_excel('Sample.xlsx')
aa.columns = ['a', 'b', 'c', 'd', 'e', 'f']
'''
FileName = '/home/contactrkk_gmail/1D311A1E02824594/AllKindOfStuff/ML/Salaries.csv'
FileType = FileName.split(".")
FileType = FileType[len(FileType)-1].lower()
observations = pd.read_csv('Salaries.csv')
'''

from urllib.request import urlopen
Example #11
TEMP_FILES = os.listdir("TrainTemp/")

NUM_EPOCHS = 1
BATCH_SIZE = 1000000
PARAMS = {
    "user_feats_size": 8222243,
    "item_feats_size": 343419,
    "embed_size": 150,
    "optimizer_type": "Adam",
    "learning_rate": 0.05,
    "l2_reg": 0.0001}


if __name__ == "__main__":
    # load data
    data = pd.concat([pdx.read_avro("TrainTemp/" + f) for f in TEMP_FILES])
    data = np.hstack((
        data["user_index"].values[:, None],
        data["item_indexs"].values[:, None]))
    _create_training_data(data)

    config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(
            device_count={"GPU": 1},
            log_device_placement=True),
        log_step_count_steps=100,
        save_summary_steps=10,
        keep_checkpoint_max=3)

    BPR = tf.estimator.Estimator(
        model_fn=model_fn, model_dir=MODEL_DIR, params=PARAMS, config=config)
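This snippet targets the TensorFlow 1.x API; model_fn, MODEL_DIR, and _create_training_data are defined elsewhere in the original script. Under TensorFlow 2.x the same session options are reachable through the compatibility module; a sketch:

import tensorflow as tf

config = tf.estimator.RunConfig().replace(
    session_config=tf.compat.v1.ConfigProto(
        device_count={"GPU": 1},
        log_device_placement=True),
    log_step_count_steps=100,
    save_summary_steps=10,
    keep_checkpoint_max=3)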
Example #12
def read(self, key: str) -> pd.DataFrame:
    df = pandavro.read_avro(key)
    return df
Example #13
import pandavro

df = pandavro.read_avro("./data/csv/example.avro")
print(df)
Example #14
    def LoadData(self, FileName, HeaderMissing="No"):
        # Supports excel, csv, tsv, xml, json, orc, parquet, avro
        import pandas as pd
        FileType = FileName.split(".")
        FileType = FileType[len(FileType) - 1].lower()
        if FileType in ('xls', 'xlsx'):
            if HeaderMissing == "Yes":
                return pd.read_excel(FileName, header=None)
            else:
                return pd.read_excel(FileName)
        if FileType == 'csv':
            if HeaderMissing == "Yes":
                return pd.read_csv(FileName, header=None)
            else:
                return pd.read_csv(FileName)
        if FileType == 'tsv':
            if HeaderMissing == "Yes":
                return pd.read_csv(FileName, header=None, sep='\t')
            else:
                return pd.read_csv(FileName, sep='\t')
        if FileType == 'orc':
            import pyarrow.orc as orc
            return orc.ORCFile(FileName).read().to_pandas()
        if FileType == 'parquet':
            import pyarrow.parquet as parquet
            return parquet.ParquetFile(FileName).read().to_pandas()
        if FileType == 'avro':
            import pandavro as pdx
            return pdx.read_avro(FileName)
        if FileType == 'json':
            import json
            from flatten_json import flatten
            with open(FileName) as RequiredFile:
                JsonData = json.load(RequiredFile)  # avoid shadowing the json module
            # pd.json_normalize replaces the deprecated pandas.io.json.json_normalize
            if isinstance(JsonData, dict):
                if (len(JsonData) > 1):
                    DataFrame = pd.json_normalize(flatten(JsonData))
                else:
                    DataFrame = pd.json_normalize(list(JsonData.values())[0])
            else:
                FlattenedData = (flatten(_json) for _json in JsonData)
                DataFrame = pd.DataFrame(FlattenedData)
            return DataFrame
        if FileType == 'xml':
            import xml.etree.ElementTree as et
            RootElement = et.parse(FileName).getroot()
            RootElementTag = RootElement.tag
            RootElementAttributes = []

            for Item in RootElement.keys():
                if "__" + RootElementTag + "___" + Item not in RootElementAttributes:
                    RootElementAttributes.append("__" + RootElementTag +
                                                 "___" + Item)

            CoreElement = []
            CoreElementAttributes = []
            CoreNodes = []
            CoreNodesAttributes = []
            FinalColumns = []

            for CE in RootElement:
                if CE.tag not in CoreElement:
                    CoreElement.append(CE.tag)
                for Item in CE.keys():
                    if CE.tag + "___" + Item not in CoreElementAttributes:
                        CoreElementAttributes.append(CE.tag + "___" + Item)
                for Item in list(CE):
                    if CE.tag + "__" + Item.tag not in CoreNodes:
                        CoreNodes.append(CE.tag + "__" + Item.tag)
                    for Item_ in Item.keys():
                        if CE.tag + "__" + Item.tag + "___" + Item_ not in CoreNodesAttributes:
                            CoreNodesAttributes.append(CE.tag + "__" +
                                                       Item.tag + "___" +
                                                       Item_)

            RootElementAttributes = sorted(RootElementAttributes)
            CoreElement = sorted(CoreElement)
            CoreElementAttributes = sorted(CoreElementAttributes)
            CoreNodes = sorted(CoreNodes)
            CoreNodesAttributes = sorted(CoreNodesAttributes)
            FinalColumns = FinalColumns + RootElementAttributes + CoreElementAttributes + CoreNodes + CoreNodesAttributes
            FinalColumns = sorted(FinalColumns)
            DataFrame = pd.DataFrame(columns=FinalColumns)

            for CE in RootElement:
                DataRow = []
                for Item in RootElementAttributes:
                    DataRow.append(RootElement.attrib.get(
                        Item.split("___")[1]))
                for Item in CoreElementAttributes:
                    DataRow.append(CE.attrib.get(Item.split("___")[1]))
                for Item in CoreNodes:
                    if CE is not None and CE.find(
                            Item.split("__")[1]) is not None:
                        DataRow.append(CE.find(Item.split("__")[1]).text)
                    else:
                        DataRow.append(None)
                    CoreNodesAttributesFiltered = [
                        Value for Value in CoreNodesAttributes
                        if Value.split("___")[0] == Item
                    ]
                    for CNAF in CoreNodesAttributesFiltered:
                        DataRow.append(
                            CE.find(Item.split("__")[1]).attrib.get(
                                CNAF.split("___")[1]))
                        #print(CE.find(Item.split("__")[1]).attrib)
                        #print("**********")
                    #print(CoreNodesAttributesFiltered)
                    #print("----------------")
                #print(DataRow)
                # DataFrame.append was removed in pandas 2.0; concatenate the
                # row instead
                DataFrame = pd.concat(
                    [DataFrame, pd.Series(DataRow, index=FinalColumns).to_frame().T],
                    ignore_index=True)
            return DataFrame
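A hypothetical use of this loader, assuming LoadData is a method of a class named DataLoader:

loader = DataLoader()
salaries = loader.LoadData("Salaries.csv")
records = loader.LoadData("example.avro")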