def read_peak_list(peaklist_xml: str, threshold_percentile: float = 0.9) -> pd.DataFrame:
    """
    Return the peaks whose intensity lies above the given percentile.

    :param peaklist_xml: file name of the peak-list XML
    :param threshold_percentile: percentile used as the intensity cut-off
    :return: data frame with "ppm" and "intensity" columns
    """
    df: pd.DataFrame = pdx.read_xml(peaklist_xml, ["PeakList", "PeakList1D", "Peak1D"])
    df = df.astype({"@F1": float, "@intensity": float})
    percentile: float = df["@intensity"].quantile(threshold_percentile)
    df = df.rename(columns={"@F1": "ppm", "@intensity": "intensity"})
    return df[df["intensity"].gt(percentile)]
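# A hedged usage sketch for read_peak_list: the file name "peaklist.xml" and the
# 0.95 cut-off below are illustrative assumptions, not values from the function above.
peaks = read_peak_list("peaklist.xml", threshold_percentile=0.95)
print(peaks.sort_values("intensity", ascending=False).head())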
def fetchxmlandread(tdf):
    mostrecent = []
    # Walk the declarations from the most recent filing date downwards.
    tdfs = tdf.sort_values(by=['date_depot'], ascending=False)
    for complementurl in tdfs['open_data']:
        try:
            URL = "https://www.hatvp.fr/livraison/dossiers/" + complementurl
            response = requests.get(URL).text
            df = pdx.read_xml(response, encoding='utf-8')
            depjson = json.loads(df.to_json())
            mostrecent = depjson['declaration']['0']['participationFinanciereDto']['items']
            break
        except Exception:
            # Skip declarations that cannot be fetched or parsed and try the next one.
            continue
    return mostrecent
def preprocess(self, image_size: Tuple[int, int] = None, automatic_initialization: bool = True, **kwargs) -> None:
    if not os.path.exists(self.get_root_path()):
        _slice_bbox_images(image_source=kwargs.get('image_source', None),
                           annotation_source=kwargs.get('annotations_source', None),
                           sliced_image_dir=kwargs.get('sliced_image_dir', None),
                           sliced_annotation_dir=kwargs.get('sliced_annotation_dir', None),
                           keep_partial_labels=kwargs.get('keep_partial_labels', True),
                           ignore_empty_files=kwargs.get('ignore_empty_files', False))
    if image_size:
        self.set_image_size(image_size)
    xml_root = f'{self.get_root_path()}/{self.slicer_annotations_dir}'
    xml_files = os.listdir(xml_root)
    images_with_smoke = []
    images_without_smoke = []
    with_counter = 0
    without_counter = 0
    for xml_file in xml_files:
        xml = pdx.read_xml(f'{xml_root}/{xml_file}', transpose=True)
        try:
            # Probing the bounding box raises KeyError when the tile has no annotated object.
            xml['annotation']['object']['bndbox']
            path = xml['annotation']['path'] + '.jpeg'
            path = f'{self.get_root_path()}/{self.slicer_images_dir}/{path.split("/")[-1]}'
            images_with_smoke.append(path)
            with_counter += 1
        except KeyError:
            # Cap the number of empty tiles relative to the annotated ones.
            if without_counter < with_counter * self.max_emptytile_inbalance_factor:
                path = xml['annotation']['path'] + '.jpeg'
                path = f'{self.get_root_path()}/{self.slicer_images_dir}/{path.split("/")[-1]}'
                images_without_smoke.append(path)
                without_counter += 1
    if self.validation_split is not None:
        for images, category in zip([images_with_smoke, images_without_smoke],
                                    ['grid_smoke', 'grid_no_smoke']):
            train_images, test_images = train_test_split(images,
                                                         test_size=self.validation_split,
                                                         shuffle=kwargs.get('shuffle', True))
            for image_set, section in zip([train_images, test_images], ['train', 'validate']):
                for image in image_set:
                    shutil.move(image, f'{self._target_dir}/{section}/{category}/{image.split("/")[-1]}')
    return None
def get_excel(self):
    global df
    # import_file_path = filedialog.askopenfilename()
    # df = pd.read_csv(import_file_path)
    import_file_path = 'C:/Users/ihor.chekh/Documents/GitHub/data_science_learning/BOM&Wire analysis/ELZ_TAB016161D_KSK_L0L_160620.xml'
    root_key_list = ['HarnessContainer', 'Harness']
    df = pdx.read_xml(import_file_path, root_key_list)
    connectors = df['Connectors']['Connector']
    new_df = pd.DataFrame(connectors)
    # new_df['AssemblyPartID'] = connectors['ConnectorAssemblyPartRefs']['@AssemblyPartID']
    # for dict_of_items in connectors:
    #     for k, v in dict_of_items.items():
    #         new_df = new_df.append({k: v}, ignore_index=True)
    print(new_df)
    new_df.to_excel(f'{import_file_path}_connectors.xlsx')
def xml_parse(path):
    """Parse an XML annotation file and return a pandas DataFrame."""
    plot_name = os.path.basename(path)
    plot_name = plot_name.split(".")[0]
    df = pdx.read_xml(path, ["annotation", "object"])
    xmin = df.bndbox.apply(lambda x: x["xmin"])
    xmax = df.bndbox.apply(lambda x: x["xmax"])
    ymin = df.bndbox.apply(lambda x: x["ymin"])
    ymax = df.bndbox.apply(lambda x: x["ymax"])
    result = pd.DataFrame({
        "xmin": xmin,
        "xmax": xmax,
        "ymin": ymin,
        "ymax": ymax,
        "plot_name": plot_name
    })
    return result
def main(args):
    if args is None or len(args) < 2:
        print("Error: An xml file must be given as input")
        sys.exit(1)

    df = pdx.read_xml(sys.argv[1], ['meandata'])
    df = pdx.flatten(df)
    df = df.pipe(pdx.flatten)
    df = df.pipe(pdx.flatten)
    df = df.pipe(pdx.flatten)
    df = df.rename(
        {
            'interval|@begin': 'begin',
            'interval|@end': 'end',
            'interval|edge|@sampledSeconds': 'sampledSeconds',
            'interval|edge|@density': 'density',
            'interval|edge|@laneDensity': 'laneDensity',
            'interval|edge|@speed': 'speed'
        },
        axis=1)
    df['begin'] = df['begin'].astype(float)
    df['end'] = df['end'].astype(float)
    df["sampledSeconds"] = df["sampledSeconds"].astype(float)
    df["density"] = df["density"].astype(float)
    df["laneDensity"] = df["laneDensity"].astype(float)
    df["speed"] = df["speed"].astype(float)
    df = df.replace(np.NaN, 0)
    df['begin'] = df['begin'].astype(int)

    # calculate the time interval
    bft = df.begin.iloc[0]
    eft = df.end.iloc[0]
    time_interval = int(eft - bft)

    # the end time of the last interval
    _lastsimulationperiod_ = int(df.end.iat[-1])

    # create a list of all interval end times
    _beginvalues_ = list(
        range(time_interval, _lastsimulationperiod_ + time_interval,
              time_interval))

    # detect the number of segments (edges) per interval
    counter1 = Counter(df.begin)
    _seg = counter1[0]

    # calculate the total length of the network
    length = df['sampledSeconds'] / (df['end'] - df['begin']) / df['density']
    df['Length'] = length.replace(np.NaN, 0).replace(np.inf, 0)

    i = 0
    j = 0
    __net = []
    while _beginvalues_[i] < _lastsimulationperiod_:
        _net = sum(df.Length.iloc[j:j + _seg])
        __net.append(_net)
        i = i + 1
        j = j + _seg

    # calculate mean density, mean flow and mean speed (density = density)
    i = 0
    j = 0
    MD = []
    MS = []
    MF = []
    while _beginvalues_[i] < _lastsimulationperiod_:
        numofveh = (1 / time_interval) * (sum(
            df.sampledSeconds.iloc[j:j + _seg]))
        speedznumofveh = (1 / time_interval) * (sum(
            df.sampledSeconds.iloc[j:j + _seg] * df.speed.iloc[j:j + _seg]))
        if numofveh > 0:
            meanspeed_ = 3.6 * speedznumofveh / numofveh
        else:
            meanspeed_ = 0
        meandensity_ = (sum(
            df.density.iloc[j:j + _seg] * df.Length.iloc[j:j + _seg])) / _net
        meanflow_ = (sum(
            df.density.iloc[j:j + _seg] * df.Length.iloc[j:j + _seg] *
            df.speed.iloc[j:j + _seg] * 3.6)) / _net
        MD.append(meandensity_)
        MS.append(meanspeed_)
        MF.append(meanflow_)
        i = i + 1
        j = j + _seg

    # plots
    plt.scatter(MD, MS)
    plt.xlabel("Density (Veh/km)")
    plt.ylabel("Speed (Km/hr)")
    plt.show()
    plt.scatter(MD, MF)
    plt.xlabel("Density (Veh/km)")
    plt.ylabel("Flow (Veh/hr)")
    plt.show()
    plt.scatter(MS, MF)
    plt.xlabel("Speed (Km/hr)")
    plt.ylabel("Flow (Veh/hr)")
    plt.show()

    # calculate mean density, mean flow and mean speed (density = laneDensity)
    i = 0
    j = 0
    lMD = []
    lMS = []
    lMF = []
    while _beginvalues_[i] <= (_lastsimulationperiod_ - time_interval):
        numofveh = (1 / time_interval) * (sum(
            df.sampledSeconds.iloc[j:j + _seg]))
        speedznumofveh = (1 / time_interval) * (sum(
            df.sampledSeconds.iloc[j:j + _seg] * df.speed.iloc[j:j + _seg]))
        if numofveh > 0:
            meanspeed_ = 3.6 * speedznumofveh / numofveh
        else:
            meanspeed_ = 0
        meandensity_ = (sum(df.laneDensity.iloc[j:j + _seg] *
                            df.Length.iloc[j:j + _seg])) / _net
        meanflow_ = (sum(
            df.laneDensity.iloc[j:j + _seg] * df.Length.iloc[j:j + _seg] *
            df.speed.iloc[j:j + _seg] * 3.6)) / _net
        lMD.append(meandensity_)
        lMS.append(meanspeed_)
        lMF.append(meanflow_)
        i = i + 1
        j = j + _seg

    # plots
    plt.scatter(lMD, lMS)
    plt.xlabel("Density (Veh/km)")
    plt.ylabel("Speed (Km/hr)")
    plt.show()
    plt.scatter(lMD, lMF)
    plt.xlabel("Density (Veh/km)")
    plt.ylabel("Flow (Veh/hr)")
    plt.show()
    plt.scatter(lMS, lMF)
    plt.xlabel("Speed (Km/hr)")
    plt.ylabel("Flow (Veh/hr)")
    plt.show()

    # build a csv file (edge density)
    Macro_Features = {
        'Density': MD,
        'Speed': MS,
        'Flow': MF,
    }
    df = pd.DataFrame(Macro_Features, columns=['Density', 'Speed', 'Flow'])
    df.to_csv('Macro_density.csv')

    # build a csv file (lane density)
    Macro_Features = {
        'Density': lMD,
        'Speed': lMS,
        'Flow': lMF,
    }
    df = pd.DataFrame(Macro_Features, columns=['Density', 'Speed', 'Flow'])
    df.to_csv('Macro_lanedensity.csv')
def load_prepare_data(self):
    """
    Prepare a dataframe with useful information from the event XML.

    Parameters
    ----------
    xml_name : str
        Name of a SeisComP-like event XML file.

    Returns
    -------
    pandas.DataFrame
        Dataframe with useful information about the events in the XML.
    """
    # load the events dataframe
    df = pdx.read_xml(self.xml_path, ['seiscomp', 'EventParameters', 'event'])
    # load the origins dataframe
    df_or = pdx.read_xml(self.xml_path, ['seiscomp', 'EventParameters', 'origin'])

    # prepare the events dataframe
    df['region'] = df['description'].apply(self.get_region_author, key='text')
    df['author'] = df['creationInfo'].apply(self.get_region_author, key='author')
    df = df[['@publicID', 'preferredOriginID', 'author', 'region']]

    # prepare the origins dataframe
    df_or['lat'] = df_or['latitude'].apply(self.get_from_dict, key='value')
    df_or['lon'] = df_or['longitude'].apply(self.get_from_dict, key='value')
    df_or['z'] = df_or['depth'].apply(self.get_from_dict, key='value')
    df_or['orig_time'] = df_or['time'].apply(self.get_from_dict, key='value', time=True)
    df_or['orig_time'] = pd.to_datetime(df_or['orig_time'])
    df_or['lat_e'] = df_or['latitude'].apply(self.get_from_dict, key='uncertainty')
    df_or['lon_e'] = df_or['longitude'].apply(self.get_from_dict, key='uncertainty')
    df_or['z_e'] = df_or['depth'].apply(self.get_from_dict, key='uncertainty')
    df_or['t_e'] = df_or['quality'].apply(self.get_from_dict, key='standardError')
    df_or['min_dis'] = df_or['quality'].apply(self.get_from_dict, key='minimumDistance')
    df_or['phasecount'] = df_or['quality'].apply(self.get_from_dict, key='usedPhaseCount')
    df_or['stationcount'] = df_or['quality'].apply(self.get_from_dict, key='usedStationCount')
    df_or['mag'] = df_or['magnitude'].apply(self.get_mag, key='value')
    df_or['mag_e'] = df_or['magnitude'].apply(self.get_mag, key='uncertainty')
    df_or['mag_count'] = df_or['magnitude'].apply(self.get_mag_count)
    df_or = df_or[[
        '@publicID', 'orig_time', 'mag', 'lat', 'lon', 'z', 'lat_e', 'lon_e',
        'z_e', 'min_dis', 'phasecount', 'stationcount', 'mag_e', 't_e'
    ]]
    df_or.rename(columns={'@publicID': 'originID'}, inplace=True)

    # merge the two dataframes
    df_merge = pd.merge(df, df_or, how='inner',
                        left_on='preferredOriginID', right_on='originID')
    n = len(df_merge)
    print(f'\nSaving events to the file {self.csv_path}\n')
    print(f'\n\n\tA total of {n} events were found\n')
    df_merge.to_csv(self.csv_path)
# I hate XML.

# =========
# LIBRARIES
# =========
import pandas  # For fancy dataframes.
import pandas_read_xml as pdx  # To convert XML to fancy dataframes.
import json  # For JSON

# =============
# READ XML FILE
# =============
statObjectDefsDataFrame = pdx.read_xml(
    "xml/StatObjectDefinitions.xml",
    ["root", "stat_object_definitions", "stat_object_definition"])

# ===================
# WRITE JSON CONTENTS
# ===================

# Stat Object Definitions
# -----------------------
jsonContent = {}
maps = {}
for index, content in statObjectDefsDataFrame.iterrows():
    # Extract data as data-frame
    fieldDefsDataFrame = pandas.DataFrame.from_dict(
from ast import literal_eval
import glob
import pandas_read_xml as pdx
from tqdm import tqdm
import base64
import numpy as np  # used below but missing from the original imports
import pandas as pd  # used below but missing from the original imports

xml_normal = glob.glob('./data/train/normal/*.xml')
xml_arr = glob.glob('./data/train/arrhythmia/*.xml')
xml_train = xml_normal + xml_arr

train_array = np.empty((0, 8, 1500))  # train_array
for xml in tqdm(xml_train[24369:]):
    Waveform = pdx.read_xml(xml, ['RestingECG', 'Waveform'])
    Rhythm = Waveform[1]
    df_rhythm = pd.json_normalize(Rhythm)
    rhythm_Lead = pd.json_normalize(df_rhythm['LeadData'])
    df_RL = pd.DataFrame()
    for i in range(len(rhythm_Lead.columns)):
        df_normal = pd.json_normalize(rhythm_Lead[i])
        df_RL = df_RL.append(df_normal)
    # rhythm_Lead = rhythm_Lead.drop([2, 3, 4, 5])
    df_rhythm_sum = pd.concat([df_rhythm.iloc[:, :-1], df_RL], axis=1)
    df_rhythm_sum = df_RL
    df_rhythm_sum.reset_index(drop=True, inplace=True)
    x = np.array(0)
    rhythm_dataset = np.empty((0, 1500))
# unzip the folder
zip = zipfile.ZipFile(save_path)
zip.extractall(save_folder)

# import the list of xml files
xml_files = []
for name in glob.glob(save_folder + "/OPEN_DATA_*.xml"):
    # xml_files.append(pdx.read_xml(name, ["cc:corpcan", "corporations"], encoding="utf8"))  # import data from federal xml
    xml_files.append(name)

# create a df of corporate names and ids and write it to .csv
df_names_ids = pd.DataFrame(columns=["corp_id", "businessNumber"])
for file in xml_files:
    print(file)
    corp_canada_df = pdx.read_xml(file, ["cc:corpcan", "corporations"], encoding="utf8")
    corp_id = []
    businessNumber = []
    for i in range(corp_canada_df.size):
        # if "businessNumbers" in corp_canada_df["corporation"][i].keys():
        try:
            bn = corp_canada_df["corporation"][i]["businessNumbers"]["businessNumber"]
            businessNumber.append(bn)
            corp_id.append(corp_canada_df["corporation"][i]["@corporationId"])
        except (KeyError, TypeError):
            businessNumber.append(None)
            corp_id.append(corp_canada_df["corporation"][i]["@corporationId"])
    df_file = pd.DataFrame({
        "corp_id": corp_id,
class req:
    def __init__(self, PageNo=0, Rows=500, DateFrom=20200310, DateTo=0):
        DateTo = time.strftime('%Y%m%d', time.localtime(time.time()))
        query = {'serviceKey': ServiceKey, 'PageNo': PageNo, 'numOfRows': Rows,
                 'startCreateDt': DateFrom, 'endCreateDt': DateTo}
        self.resp = get(url=url, params=query)


a = req(PageNo=1, DateFrom=20200311)
response = a.resp.text

df0 = pd.read_csv("https://pastebin.com/raw/FFzk1m53")
df = pdx.read_xml(response, ['response', 'body', 'items', 'item'])
df = df.drop_duplicates('stateDt', keep='first')

now = dt.datetime.now()
print(now)  #########################
print(dt.datetime.hour)  ################
print(now.hour)  ################
nHour = (now).strftime("%Y-%m-%d") if now.hour < 10 else (now + dt.timedelta(days=1)).strftime("%Y-%m-%d")
nTime = now - dt.timedelta(days=1) if now.hour < 10 else now
sp = np.datetime_as_string(np.arange('2020-03-11', nHour, dtype='datetime64[D]'), unit='D')  #################
sp = np.flip(sp)  ###############
df['stateDt'] = sp
df = df.iloc[::-1]
df = pd.concat([df0, df], ignore_index=True)
df = df.drop_duplicates('stateDt', keep='first')
print(sp)
def raw(self):
    """ Raw data from XML file """
    raw = pdx.read_xml(self.url, ['proposicao', 'Votacoes', 'Votacao'])
    return raw
import pandas_read_xml as pdx

df = pdx.read_xml("data.xml")
print(df.to_json())
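# A hedged follow-up sketch: deeply nested XML usually needs a root-key path and
# flattening before the columns are usable. "nested.xml" and the key path below are
# assumptions for illustration, not taken from the snippets in this collection.
import pandas_read_xml as pdx

df = pdx.read_xml("nested.xml", ["root", "records", "record"])  # hypothetical structure
df = pdx.auto_flatten(df)  # expand dict/list columns into flat "a|b|@attr" columns
print(df.head())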
def get_dataframe(file_name):
    df = pdx.read_xml('D:\\PycharmProjects\\pythonProject\\config\\' + file_name + '.xml',
                      ['lvr_land', '買賣'], encoding='UTF-8')
    return df
def find_pandas(xbrl_str):
    print("Hi")
    test_zip_path = xbrl_str
    root_key_list = ['F_000151']
    df = pdx.read_xml(test_zip_path, root_key_list, transpose=True)
    print(df)
from django.shortcuts import render
import spacy
import xml.etree.ElementTree as ET
import pandas_read_xml as pdx

nlp = spacy.load("en_core_web_trf")
print("nlp loaded")

url = 'http://www.orphadata.org/data/xml/en_product4.xml'
df = pdx.read_xml(url, ['JDBOR', 'HPODisorderSetStatusList'], encoding='cp1252')
print("xml read")


# Create your views here.
def index(request):
    return render(request, 'index.html')


def result(request):
    testquery = str(request.POST.get('testquery', False))
    nouns = getNouns(testquery)
    result = conditionsFromMedicalTerms(nouns)
    return render(request, 'result.html', {'result': result})


def getNouns(sentence):
    nlpSentence = nlp(sentence)
    nouns = []
    for token in nlpSentence:
import pandas as pd
import pandas_read_xml as pdx
from pandas_read_xml import auto_separate_tables

file_path = 'data_science_learning/BOM&Wire analysis/ELZ_TAB016161D_KSK_L0L_160620.xml'
root_key_list = ['HarnessContainer', 'Harness', 'Connectors']
key_columns = ['Connector', 'ConnectorModuleRefs|ConnectorModuleRef']

df = pdx.read_xml(file_path, root_key_list)
data = df.pipe(auto_separate_tables, key_columns)
# data.to_excel('data_science_learning/pandas_read_xml/apc200219_new.xlsx')
print(data.keys())
pred_speaker = "Biden" elif pred_speaker[0] == 1: pred_speaker = "Trump" st.write( f"With >80% accuracy, we predict that the message was said by {pred_speaker}." ) # Prevents running errors before user has entered data file URL if (xml_path == ""): st.write("No data entered.") tb_main() st.stop() try: df_sms = pdx.read_xml("https://drive.google.com/uc?id=" + xml_path, encoding="utf8") name, df = add_features(df_sms) except: df_sms = pdx.read_xml(xml_path, encoding="utf8") name, df = add_features(df_sms) df['person'] = df['person'].astype(int) first = df['person'][0] countt = [0, 0] who_spoke = [[], []] for i in range(len(df['person'])): if df['person'][i] == first: countt[0] += 1 who_spoke[0].append(i) else: countt[1] += 1
images_path = "C:\\Users\\hsnyt\\Desktop\\trafic\\azami hız 30"
labels_path = "C:\\Users\\hsnyt\\Desktop\\trafic\\txtdosya"
# folder that will hold the cropped files
new_image_path = 'C:\\Users\\hsnyt\\Desktop\\hazır\\'

(klasorler, dosyalar) = ayristir(labels_path)

# prepared in the 0/1 format used in the GitHub repo
label_names = [
    "Sol", "Sag", "IleriSol", "IleriSag", "SolaDonulmez", "SagaDonulmez",
    "Girilmez", "TrafikKapalı", "Dur", "ParkYapılmaz", "ParkYeri",
    "HizSiniri30", "HizSiniri40", "HizSiniri20Bitti", "Durak"
]

for i in dosyalar:
    df = pdx.read_xml(os.path.join(labels_path, i))
    image_name = df.loc['object'].loc['filename']
    image_number = 0
    xmin = int(df.loc['object'].loc['annotation'][0]['bndbox']['xmin'])
    ymin = int(df.loc['object'].loc['annotation'][0]['bndbox']['ymin'])
    xmax = int(df.loc['object'].loc['annotation'][0]['bndbox']['xmax'])
    ymax = int(df.loc['object'].loc['annotation'][0]['bndbox']['ymax'])
    crop_box = (xmin, ymin, xmax, ymax)
    print(crop_box)
    im = im.crop(crop_box)
def generatOneRow(filePath: str) -> pd.DataFrame:
    # Create a dataframe from a xml file
    df_singleRowXml = pdx.read_xml(filePath)
    df_singleRowXml = pdx.auto_flatten(df_singleRowXml).dropna(axis=1)
    df_singleRowXml = cleanOneRow(df_singleRowXml)
    return df_singleRowXml
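# Hedged usage sketch for generatOneRow: combine several single-row frames into one table.
# The glob pattern "rows/*.xml" is an assumption for illustration, not from the snippet above.
import glob
import pandas as pd

frames = [generatOneRow(p) for p in glob.glob("rows/*.xml")]
combined = pd.concat(frames, ignore_index=True)
print(combined.shape)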