def read_peak_list(peaklist_xml: str, threshold_percentile: float = 0.9) -> pd.DataFrame:
    """
    Return the peaks whose intensity lies above the given percentile.

    :param peaklist_xml: file name of the peak-list XML
    :param threshold_percentile: percentile used as the intensity cut-off
    :return: data frame with "ppm" and "intensity" columns
    """
    df: pd.DataFrame = pdx.read_xml(peaklist_xml, ["PeakList", "PeakList1D", "Peak1D"])
    df = df.astype({"@F1": float, "@intensity": float})
    percentile: float = df["@intensity"].quantile(threshold_percentile)
    df = df.rename(columns={"@F1": "ppm", "@intensity": "intensity"})
    return df[df["intensity"].gt(percentile)]
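# A hedged usage sketch for read_peak_list: the file name "peaklist.xml" and the
# 0.95 cut-off below are illustrative assumptions, not values from the function above.
peaks = read_peak_list("peaklist.xml", threshold_percentile=0.95)
print(peaks.sort_values("intensity", ascending=False).head())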
def fetchxmlandread(tdf):
    mostrecent = []
    # Walk the declarations from the most recent filing date downwards.
    tdfs = tdf.sort_values(by=['date_depot'], ascending=False)
    for complementurl in tdfs['open_data']:
        try:
            URL = "https://www.hatvp.fr/livraison/dossiers/" + complementurl
            response = requests.get(URL).text
            df = pdx.read_xml(response, encoding='utf-8')
            depjson = json.loads(df.to_json())
            mostrecent = depjson['declaration']['0']['participationFinanciereDto']['items']
            break
        except Exception:
            # Skip declarations that cannot be fetched or parsed and try the next one.
            continue
    return mostrecent
def preprocess(self, image_size: Tuple[int, int] = None, automatic_initialization: bool = True, **kwargs) -> None:
    if not os.path.exists(self.get_root_path()):
        _slice_bbox_images(image_source=kwargs.get('image_source', None),
                           annotation_source=kwargs.get('annotations_source', None),
                           sliced_image_dir=kwargs.get('sliced_image_dir', None),
                           sliced_annotation_dir=kwargs.get('sliced_annotation_dir', None),
                           keep_partial_labels=kwargs.get('keep_partial_labels', True),
                           ignore_empty_files=kwargs.get('ignore_empty_files', False))
    if image_size:
        self.set_image_size(image_size)
    xml_root = f'{self.get_root_path()}/{self.slicer_annotations_dir}'
    xml_files = os.listdir(xml_root)
    images_with_smoke = []
    images_without_smoke = []
    with_counter = 0
    without_counter = 0
    for xml_file in xml_files:
        xml = pdx.read_xml(f'{xml_root}/{xml_file}', transpose=True)
        try:
            # Probing the bounding box raises KeyError when the tile has no annotated object.
            xml['annotation']['object']['bndbox']
            path = xml['annotation']['path'] + '.jpeg'
            path = f'{self.get_root_path()}/{self.slicer_images_dir}/{path.split("/")[-1]}'
            images_with_smoke.append(path)
            with_counter += 1
        except KeyError:
            # Cap the number of empty tiles relative to the annotated ones.
            if without_counter < with_counter * self.max_emptytile_inbalance_factor:
                path = xml['annotation']['path'] + '.jpeg'
                path = f'{self.get_root_path()}/{self.slicer_images_dir}/{path.split("/")[-1]}'
                images_without_smoke.append(path)
                without_counter += 1
    if self.validation_split is not None:
        for images, category in zip([images_with_smoke, images_without_smoke],
                                    ['grid_smoke', 'grid_no_smoke']):
            train_images, test_images = train_test_split(images,
                                                         test_size=self.validation_split,
                                                         shuffle=kwargs.get('shuffle', True))
            for image_set, section in zip([train_images, test_images], ['train', 'validate']):
                for image in image_set:
                    shutil.move(image, f'{self._target_dir}/{section}/{category}/{image.split("/")[-1]}')
    return None
def get_excel(self):
    global df
    # import_file_path = filedialog.askopenfilename()
    # df = pd.read_csv(import_file_path)
    import_file_path = 'C:/Users/ihor.chekh/Documents/GitHub/data_science_learning/BOM&Wire analysis/ELZ_TAB016161D_KSK_L0L_160620.xml'
    root_key_list = ['HarnessContainer', 'Harness']
    df = pdx.read_xml(import_file_path, root_key_list)
    connectors = df['Connectors']['Connector']
    new_df = pd.DataFrame(connectors)
    # new_df['AssemblyPartID'] = connectors['ConnectorAssemblyPartRefs']['@AssemblyPartID']
    # for dict_of_items in connectors:
    #     for k, v in dict_of_items.items():
    #         new_df = new_df.append({k: v}, ignore_index=True)
    print(new_df)
    new_df.to_excel(f'{import_file_path}_connectors.xlsx')
def xml_parse(path):
    """Parse an XML annotation file and return a pandas DataFrame."""
    plot_name = os.path.basename(path)
    plot_name = plot_name.split(".")[0]
    df = pdx.read_xml(path, ["annotation", "object"])
    xmin = df.bndbox.apply(lambda x: x["xmin"])
    xmax = df.bndbox.apply(lambda x: x["xmax"])
    ymin = df.bndbox.apply(lambda x: x["ymin"])
    ymax = df.bndbox.apply(lambda x: x["ymax"])
    result = pd.DataFrame({
        "xmin": xmin,
        "xmax": xmax,
        "ymin": ymin,
        "ymax": ymax,
        "plot_name": plot_name
    })
    return result
def main(args):
    if args is None or len(args) < 2:
        print("Error: An xml file must be given as input")
        sys.exit(1)

    df = pdx.read_xml(sys.argv[1], ['meandata'])
    df = pdx.flatten(df)
    df = df.pipe(pdx.flatten)
    df = df.pipe(pdx.flatten)
    df = df.pipe(pdx.flatten)
    df = df.rename(
        {
            'interval|@begin': 'begin',
            'interval|@end': 'end',
            'interval|edge|@sampledSeconds': 'sampledSeconds',
            'interval|edge|@density': 'density',
            'interval|edge|@laneDensity': 'laneDensity',
            'interval|edge|@speed': 'speed'
        },
        axis=1)
    df['begin'] = df['begin'].astype(float)
    df['end'] = df['end'].astype(float)
    df["sampledSeconds"] = df["sampledSeconds"].astype(float)
    df["density"] = df["density"].astype(float)
    df["laneDensity"] = df["laneDensity"].astype(float)
    df["speed"] = df["speed"].astype(float)
    df = df.replace(np.NaN, 0)
    df['begin'] = df['begin'].astype(int)

    # calculate the time interval
    bft = df.begin.iloc[0]
    eft = df.end.iloc[0]
    time_interval = int(eft - bft)

    # the end time of the last interval
    _lastsimulationperiod_ = int(df.end.iat[-1])

    # create a list of all interval end times
    _beginvalues_ = list(
        range(time_interval, _lastsimulationperiod_ + time_interval,
              time_interval))

    # detect the number of segments (edges) per interval
    counter1 = Counter(df.begin)
    _seg = counter1[0]

    # calculate the total length of the network
    length = df['sampledSeconds'] / (df['end'] - df['begin']) / df['density']
    df['Length'] = length.replace(np.NaN, 0).replace(np.inf, 0)

    i = 0
    j = 0
    __net = []
    while _beginvalues_[i] < _lastsimulationperiod_:
        _net = sum(df.Length.iloc[j:j + _seg])
        __net.append(_net)
        i = i + 1
        j = j + _seg

    # calculate mean density, mean flow and mean speed (density = density)
    i = 0
    j = 0
    MD = []
    MS = []
    MF = []
    while _beginvalues_[i] < _lastsimulationperiod_:
        numofveh = (1 / time_interval) * (sum(
            df.sampledSeconds.iloc[j:j + _seg]))
        speedznumofveh = (1 / time_interval) * (sum(
            df.sampledSeconds.iloc[j:j + _seg] * df.speed.iloc[j:j + _seg]))
        if numofveh > 0:
            meanspeed_ = 3.6 * speedznumofveh / numofveh
        else:
            meanspeed_ = 0
        meandensity_ = (sum(
            df.density.iloc[j:j + _seg] * df.Length.iloc[j:j + _seg])) / _net
        meanflow_ = (sum(
            df.density.iloc[j:j + _seg] * df.Length.iloc[j:j + _seg] *
            df.speed.iloc[j:j + _seg] * 3.6)) / _net
        MD.append(meandensity_)
        MS.append(meanspeed_)
        MF.append(meanflow_)
        i = i + 1
        j = j + _seg

    # plots
    plt.scatter(MD, MS)
    plt.xlabel("Density (Veh/km)")
    plt.ylabel("Speed (Km/hr)")
    plt.show()
    plt.scatter(MD, MF)
    plt.xlabel("Density (Veh/km)")
    plt.ylabel("Flow (Veh/hr)")
    plt.show()
    plt.scatter(MS, MF)
    plt.xlabel("Speed (Km/hr)")
    plt.ylabel("Flow (Veh/hr)")
    plt.show()

    # calculate mean density, mean flow and mean speed (density = laneDensity)
    i = 0
    j = 0
    lMD = []
    lMS = []
    lMF = []
    while _beginvalues_[i] <= (_lastsimulationperiod_ - time_interval):
        numofveh = (1 / time_interval) * (sum(
            df.sampledSeconds.iloc[j:j + _seg]))
        speedznumofveh = (1 / time_interval) * (sum(
            df.sampledSeconds.iloc[j:j + _seg] * df.speed.iloc[j:j + _seg]))
        if numofveh > 0:
            meanspeed_ = 3.6 * speedznumofveh / numofveh
        else:
            meanspeed_ = 0
        meandensity_ = (sum(df.laneDensity.iloc[j:j + _seg] *
                            df.Length.iloc[j:j + _seg])) / _net
        meanflow_ = (sum(
            df.laneDensity.iloc[j:j + _seg] * df.Length.iloc[j:j + _seg] *
            df.speed.iloc[j:j + _seg] * 3.6)) / _net
        lMD.append(meandensity_)
        lMS.append(meanspeed_)
        lMF.append(meanflow_)
        i = i + 1
        j = j + _seg

    # plots
    plt.scatter(lMD, lMS)
    plt.xlabel("Density (Veh/km)")
    plt.ylabel("Speed (Km/hr)")
    plt.show()
    plt.scatter(lMD, lMF)
    plt.xlabel("Density (Veh/km)")
    plt.ylabel("Flow (Veh/hr)")
    plt.show()
    plt.scatter(lMS, lMF)
    plt.xlabel("Speed (Km/hr)")
    plt.ylabel("Flow (Veh/hr)")
    plt.show()

    # build a csv file (edge density)
    Macro_Features = {
        'Density': MD,
        'Speed': MS,
        'Flow': MF,
    }
    df = pd.DataFrame(Macro_Features, columns=['Density', 'Speed', 'Flow'])
    df.to_csv('Macro_density.csv')

    # build a csv file (lane density)
    Macro_Features = {
        'Density': lMD,
        'Speed': lMS,
        'Flow': lMF,
    }
    df = pd.DataFrame(Macro_Features, columns=['Density', 'Speed', 'Flow'])
    df.to_csv('Macro_lanedensity.csv')
def load_prepare_data(self):
    """
    Prepare a dataframe with useful information from the event XML.

    Parameters
    ----------
    xml_name : str
        Name of a SeisComP-like event XML file.

    Returns
    -------
    pandas.DataFrame
        Dataframe with useful information about the events in the XML.
    """
    # load the events dataframe
    df = pdx.read_xml(self.xml_path, ['seiscomp', 'EventParameters', 'event'])
    # load the origins dataframe
    df_or = pdx.read_xml(self.xml_path, ['seiscomp', 'EventParameters', 'origin'])

    # prepare the events dataframe
    df['region'] = df['description'].apply(self.get_region_author, key='text')
    df['author'] = df['creationInfo'].apply(self.get_region_author, key='author')
    df = df[['@publicID', 'preferredOriginID', 'author', 'region']]

    # prepare the origins dataframe
    df_or['lat'] = df_or['latitude'].apply(self.get_from_dict, key='value')
    df_or['lon'] = df_or['longitude'].apply(self.get_from_dict, key='value')
    df_or['z'] = df_or['depth'].apply(self.get_from_dict, key='value')
    df_or['orig_time'] = df_or['time'].apply(self.get_from_dict, key='value', time=True)
    df_or['orig_time'] = pd.to_datetime(df_or['orig_time'])
    df_or['lat_e'] = df_or['latitude'].apply(self.get_from_dict, key='uncertainty')
    df_or['lon_e'] = df_or['longitude'].apply(self.get_from_dict, key='uncertainty')
    df_or['z_e'] = df_or['depth'].apply(self.get_from_dict, key='uncertainty')
    df_or['t_e'] = df_or['quality'].apply(self.get_from_dict, key='standardError')
    df_or['min_dis'] = df_or['quality'].apply(self.get_from_dict, key='minimumDistance')
    df_or['phasecount'] = df_or['quality'].apply(self.get_from_dict, key='usedPhaseCount')
    df_or['stationcount'] = df_or['quality'].apply(self.get_from_dict, key='usedStationCount')
    df_or['mag'] = df_or['magnitude'].apply(self.get_mag, key='value')
    df_or['mag_e'] = df_or['magnitude'].apply(self.get_mag, key='uncertainty')
    df_or['mag_count'] = df_or['magnitude'].apply(self.get_mag_count)
    df_or = df_or[[
        '@publicID', 'orig_time', 'mag', 'lat', 'lon', 'z', 'lat_e', 'lon_e',
        'z_e', 'min_dis', 'phasecount', 'stationcount', 'mag_e', 't_e'
    ]]
    df_or.rename(columns={'@publicID': 'originID'}, inplace=True)

    # merge the two dataframes
    df_merge = pd.merge(df, df_or, how='inner',
                        left_on='preferredOriginID', right_on='originID')
    n = len(df_merge)
    print(f'\nSaving events to the file {self.csv_path}\n')
    print(f'\n\n\tA total of {n} events were found\n')
    df_merge.to_csv(self.csv_path)
# I hate XML.

# =========
# LIBRARIES
# =========
import pandas  # For fancy dataframes.
import pandas_read_xml as pdx  # To convert XML to fancy dataframes.
import json  # For JSON

# =============
# READ XML FILE
# =============
statObjectDefsDataFrame = pdx.read_xml(
    "xml/StatObjectDefinitions.xml",
    ["root", "stat_object_definitions", "stat_object_definition"])

# ===================
# WRITE JSON CONTENTS
# ===================

# Stat Object Definitions
# -----------------------
jsonContent = {}
maps = {}
for index, content in statObjectDefsDataFrame.iterrows():
    # Extract data as data-frame
    fieldDefsDataFrame = pandas.DataFrame.from_dict(
from ast import literal_eval
import glob
import pandas_read_xml as pdx
from tqdm import tqdm
import base64
import numpy as np  # used below but missing from the original imports
import pandas as pd  # used below but missing from the original imports

xml_normal = glob.glob('./data/train/normal/*.xml')
xml_arr = glob.glob('./data/train/arrhythmia/*.xml')
xml_train = xml_normal + xml_arr

train_array = np.empty((0, 8, 1500))  # train_array
for xml in tqdm(xml_train[24369:]):
    Waveform = pdx.read_xml(xml, ['RestingECG', 'Waveform'])
    Rhythm = Waveform[1]
    df_rhythm = pd.json_normalize(Rhythm)
    rhythm_Lead = pd.json_normalize(df_rhythm['LeadData'])
    df_RL = pd.DataFrame()
    for i in range(len(rhythm_Lead.columns)):
        df_normal = pd.json_normalize(rhythm_Lead[i])
        df_RL = df_RL.append(df_normal)
    # rhythm_Lead = rhythm_Lead.drop([2, 3, 4, 5])
    df_rhythm_sum = pd.concat([df_rhythm.iloc[:, :-1], df_RL], axis=1)
    df_rhythm_sum = df_RL
    df_rhythm_sum.reset_index(drop=True, inplace=True)
    x = np.array(0)
    rhythm_dataset = np.empty((0, 1500))
# unzip the folder
zip = zipfile.ZipFile(save_path)
zip.extractall(save_folder)

# import the list of xml files
xml_files = []
for name in glob.glob(save_folder + "/OPEN_DATA_*.xml"):
    # xml_files.append(pdx.read_xml(name, ["cc:corpcan", "corporations"], encoding="utf8"))  # import data from federal xml
    xml_files.append(name)

# create a df of corporate names and ids and write it to .csv
df_names_ids = pd.DataFrame(columns=["corp_id", "businessNumber"])
for file in xml_files:
    print(file)
    corp_canada_df = pdx.read_xml(file, ["cc:corpcan", "corporations"], encoding="utf8")
    corp_id = []
    businessNumber = []
    for i in range(corp_canada_df.size):
        # if "businessNumbers" in corp_canada_df["corporation"][i].keys():
        try:
            bn = corp_canada_df["corporation"][i]["businessNumbers"]["businessNumber"]
            businessNumber.append(bn)
            corp_id.append(corp_canada_df["corporation"][i]["@corporationId"])
        except (KeyError, TypeError):
            businessNumber.append(None)
            corp_id.append(corp_canada_df["corporation"][i]["@corporationId"])
    df_file = pd.DataFrame({
        "corp_id": corp_id,
class req:
    def __init__(self, PageNo=0, Rows=500, DateFrom=20200310, DateTo=0):
        DateTo = time.strftime('%Y%m%d', time.localtime(time.time()))
        query = {'serviceKey': ServiceKey, 'PageNo': PageNo, 'numOfRows': Rows,
                 'startCreateDt': DateFrom, 'endCreateDt': DateTo}
        self.resp = get(url=url, params=query)


a = req(PageNo=1, DateFrom=20200311)
response = a.resp.text

df0 = pd.read_csv("https://pastebin.com/raw/FFzk1m53")
df = pdx.read_xml(response, ['response', 'body', 'items', 'item'])
df = df.drop_duplicates('stateDt', keep='first')

now = dt.datetime.now()
print(now)  #########################
print(dt.datetime.hour)  ################
print(now.hour)  ################
nHour = (now).strftime("%Y-%m-%d") if now.hour < 10 else (now + dt.timedelta(days=1)).strftime("%Y-%m-%d")
nTime = now - dt.timedelta(days=1) if now.hour < 10 else now
sp = np.datetime_as_string(np.arange('2020-03-11', nHour, dtype='datetime64[D]'), unit='D')  #################
sp = np.flip(sp)  ###############
df['stateDt'] = sp
df = df.iloc[::-1]
df = pd.concat([df0, df], ignore_index=True)
df = df.drop_duplicates('stateDt', keep='first')
print(sp)
def raw(self):
    """ Raw data from XML file """
    raw = pdx.read_xml(self.url, ['proposicao', 'Votacoes', 'Votacao'])
    return raw
import pandas_read_xml as pdx

df = pdx.read_xml("data.xml")
print(df.to_json())
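# A hedged follow-up sketch: deeply nested XML usually needs a root-key path and
# flattening before the columns are usable. "nested.xml" and the key path below are
# assumptions for illustration, not taken from the snippets in this collection.
import pandas_read_xml as pdx

df = pdx.read_xml("nested.xml", ["root", "records", "record"])  # hypothetical structure
df = pdx.auto_flatten(df)  # expand dict/list columns into flat "a|b|@attr" columns
print(df.head())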
def get_dataframe(file_name):
    df = pdx.read_xml('D:\\PycharmProjects\\pythonProject\\config\\' + file_name + '.xml',
                      ['lvr_land', '買賣'], encoding='UTF-8')
    return df
def find_pandas(xbrl_str):
    print("Hi")
    test_zip_path = xbrl_str
    root_key_list = ['F_000151']
    df = pdx.read_xml(test_zip_path, root_key_list, transpose=True)
    print(df)
from django.shortcuts import render
import spacy
import xml.etree.ElementTree as ET
import pandas_read_xml as pdx

nlp = spacy.load("en_core_web_trf")
print("nlp loaded")

url = 'http://www.orphadata.org/data/xml/en_product4.xml'
df = pdx.read_xml(url, ['JDBOR', 'HPODisorderSetStatusList'], encoding='cp1252')
print("xml read")


# Create your views here.
def index(request):
    return render(request, 'index.html')


def result(request):
    testquery = str(request.POST.get('testquery', False))
    nouns = getNouns(testquery)
    result = conditionsFromMedicalTerms(nouns)
    return render(request, 'result.html', {'result': result})


def getNouns(sentence):
    nlpSentence = nlp(sentence)
    nouns = []
    for token in nlpSentence:
import pandas as pd
import pandas_read_xml as pdx
from pandas_read_xml import auto_separate_tables

file_path = 'data_science_learning/BOM&Wire analysis/ELZ_TAB016161D_KSK_L0L_160620.xml'
root_key_list = ['HarnessContainer', 'Harness', 'Connectors']
key_columns = ['Connector', 'ConnectorModuleRefs|ConnectorModuleRef']

df = pdx.read_xml(file_path, root_key_list)
data = df.pipe(auto_separate_tables, key_columns)
# data.to_excel('data_science_learning/pandas_read_xml/apc200219_new.xlsx')
print(data.keys())
pred_speaker = "Biden" elif pred_speaker[0] == 1: pred_speaker = "Trump" st.write( f"With >80% accuracy, we predict that the message was said by {pred_speaker}." ) # Prevents running errors before user has entered data file URL if (xml_path == ""): st.write("No data entered.") tb_main() st.stop() try: df_sms = pdx.read_xml("https://drive.google.com/uc?id=" + xml_path, encoding="utf8") name, df = add_features(df_sms) except: df_sms = pdx.read_xml(xml_path, encoding="utf8") name, df = add_features(df_sms) df['person'] = df['person'].astype(int) first = df['person'][0] countt = [0, 0] who_spoke = [[], []] for i in range(len(df['person'])): if df['person'][i] == first: countt[0] += 1 who_spoke[0].append(i) else: countt[1] += 1
images_path = "C:\\Users\\hsnyt\\Desktop\\trafic\\azami hız 30"
labels_path = "C:\\Users\\hsnyt\\Desktop\\trafic\\txtdosya"
# folder that will hold the cropped files
new_image_path = 'C:\\Users\\hsnyt\\Desktop\\hazır\\'

(klasorler, dosyalar) = ayristir(labels_path)

# prepared in the 0/1 format used in the GitHub repo
label_names = [
    "Sol", "Sag", "IleriSol", "IleriSag", "SolaDonulmez", "SagaDonulmez",
    "Girilmez", "TrafikKapalı", "Dur", "ParkYapılmaz", "ParkYeri",
    "HizSiniri30", "HizSiniri40", "HizSiniri20Bitti", "Durak"
]

for i in dosyalar:
    df = pdx.read_xml(os.path.join(labels_path, i))
    image_name = df.loc['object'].loc['filename']
    image_number = 0
    xmin = int(df.loc['object'].loc['annotation'][0]['bndbox']['xmin'])
    ymin = int(df.loc['object'].loc['annotation'][0]['bndbox']['ymin'])
    xmax = int(df.loc['object'].loc['annotation'][0]['bndbox']['xmax'])
    ymax = int(df.loc['object'].loc['annotation'][0]['bndbox']['ymax'])
    crop_box = (xmin, ymin, xmax, ymax)
    print(crop_box)
    im = im.crop(crop_box)
def generatOneRow(filePath: str) -> pd.DataFrame:
    # Create a dataframe from a xml file
    df_singleRowXml = pdx.read_xml(filePath)
    df_singleRowXml = pdx.auto_flatten(df_singleRowXml).dropna(axis=1)
    df_singleRowXml = cleanOneRow(df_singleRowXml)
    return df_singleRowXml
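# Hedged usage sketch for generatOneRow: combine several single-row frames into one table.
# The glob pattern "rows/*.xml" is an assumption for illustration, not from the snippet above.
import glob
import pandas as pd

frames = [generatOneRow(p) for p in glob.glob("rows/*.xml")]
combined = pd.concat(frames, ignore_index=True)
print(combined.shape)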