/
DAG file.py
144 lines (119 loc) · 6.89 KB
/
DAG file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import datetime
import os
import airflow
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
import pandas as pd
import os
from pandas import ExcelWriter
from os import listdir
from os.path import isfile, join
import time
import sys
import numpy as np
YESTERDAY = datetime.datetime.now() - datetime.timedelta(days=1)
default_args = {
'owner': 'Composer Example',
'depends_on_past': False,
'email': [''],
'email_on_failure': "ashutosh.dash3@vodafone.com",
'email_on_retry': False,
'retries': 1,
'retry_delay': datetime.timedelta(minutes=5),
'start_date': YESTERDAY,
}
def ModifyInputfiles():
def get_excel_files(path):
return [f for f in listdir(path) if isfile(join(path, f))]
dirpath = '/home/airflow/gcs/data/'
print("Dirpath:"+dirpath)
if not os.path.exists(os.path.join(dirpath,'in-data')):
os.mkdir(os.path.join(dirpath,'in-data'))
OutputFolder = os.path.join(dirpath,'in-data')
inputfilepath = '/home/airflow/gcs/data/input'
mappingFilePath = '/home/airflow/gcs/data/map'
mapping = get_excel_files(mappingFilePath)
inputfilelist = get_excel_files(inputfilepath)
for map_file in mapping: #if in future we need to take multiple files
print("running file:" + map_file)
mapping_exten = map_file.split('.')[1]
if mapping_exten == 'csv':
mapping_df = pd.read_csv(os.path.join(mappingFilePath,map_file),sep=';')
elif mapping_exten == 'xlsx':
mapping_df = pd.read_excel(os.path.join(mappingFilePath,map_file))
mapping_df = mapping_df.loc[:,mapping_df.columns.intersection(['VM','VNF Name','VNF Program name','VNF vendor name','Site Name'])]
mapping_df =mapping_df.rename(columns = {'VNF Program name':'Program Name' ,'VNF vendor name' :'Vendor'})
writer = ExcelWriter(os.path.join(OutputFolder,'FS_Input_'+time.strftime("%Y_%m_%d")+'.xlsx'))
final_input_df = pd.DataFrame()
for input in inputfilelist: #if in future we need to take multiple files
print("Running file: "+input)
input_exten = input.split('.')[1]
file_name = input.split('.')[0]
site_name = file_name.split('_')[0]
if input_exten == 'csv':
input_df = pd.read_csv(os.path.join(inputfilepath , input),sep=';')
elif input_exten == 'xlsx':
input_df = pd.read_excel(os.path.join(inputfilepath , input))
input_df = input_df.rename(columns=lambda x: x.strip())
input_df['Disk Average Read Bytes Per Second (kB/s)'] = pd.to_numeric(input_df['Disk Average Read Bytes Per Second (kB/s)'].astype(str).replace('.',''), errors='coerce')
input_df['Disk Average Read Bytes Per Second (kB/s)'] = pd.to_numeric(input_df['Disk Average Read Bytes Per Second (kB/s)'], downcast="float")
input_df['Disk Average Write Bytes Per Second (kB/s)'] = pd.to_numeric(input_df['Disk Average Write Bytes Per Second (kB/s)'].astype(str).replace('.',''), errors='coerce')
input_df['Disk Average Write Bytes Per Second (kB/s)'] = pd.to_numeric(input_df['Disk Average Write Bytes Per Second (kB/s)'], downcast="float")
#input_df['Site Name'] = site_name
input_df = input_df.rename(columns=lambda x: x.lower())
final_input_df = final_input_df.append(input_df)
final_input_df =final_input_df.rename(columns = {'site name':'SiteName','start time':'timestamp','vm':'VM','cpu usage %':'CPU usage (%)','memory usage %':'Memory Usage (%)','disk usage %':'Disk Usage (%)','disk average read bytes per second (kb/s)':'Disk Average Read Bytes Per Second (kB/s)','disk average write bytes per second (kb/s)':'Disk Average Write Bytes Per Second (kB/s)'})
final_input_df = pd.merge(final_input_df,mapping_df, on='VM', how='left')
final_input_df['Disk Usage (%)'] = final_input_df['Disk Usage (%)'].convert_objects(convert_numeric=True)
final_input_df['Memory Usage (%)'] = final_input_df['Memory Usage (%)'].convert_objects(convert_numeric=True)
final_input_df['CPU usage (%)'] = final_input_df['CPU usage (%)'].convert_objects(convert_numeric=True)
final_input_df['Disk Usage (%)'] = np.round(final_input_df['Disk Usage (%)'].astype(float) ,3)
final_input_df['Memory Usage (%)'] = np.round(final_input_df['Memory Usage (%)'].astype(float),3)
final_input_df['CPU usage (%)'] = np.round(final_input_df['CPU usage (%)'].astype(float) ,3)
final_input_df.to_excel(writer,index=False)
writer.save()
writer.close()
print('end of process')
with airflow.DAG(
'composer_FSR_RO_dag',
'catchup=False',
default_args=default_args,
schedule_interval='*/5 * * * *') as dag:
OPCO = 'ro'
date = datetime
path = "/home/airflow/gcs/data/input"
if os.path.exists(path) and not os.path.isfile(path):
run_command0 = "rm -rv /home/airflow/gcs/data/map/*" "/home/airflow/gcs/data/input/* /home/airflow/gcs/data/scripts/OPCO_OUTPUT/*"
if os.listdir(path): #if not empty direcctory
t1 = BashOperator(
task_id= 'cleaning_previous_files',
bash_command= run_command0,
dag=dag)
else:
run_command0 = "mkdir /home/airflow/gcs/data/map/ /home/airflow/gcs/data/input/"
t1 = BashOperator(
task_id= 'creating_the_paths',
bash_command= run_command0,
dag=dag)
run_command2 = "gsutil cp gs://fushionsphere/MappingFiles/ro/ro_mapping.csv /home/airflow/gcs/data/map &&" " gsutil cp gs://fushionsphere/InputFile/ro/* /home/airflow/gcs/data/input"
t2 = BashOperator(
task_id= 'getting_map_input_data',
bash_command= run_command2,
dag=dag)
run_command3 = "python3 ~airflow/dags/ModifyInput.py /home/airflow/gcs/data/input /home/airflow/gcs/data/map"
#t3 = PythonOperator(task_id='input_data_engineering', python_callable=ModifyInputfiles)
t3 = BashOperator(
task_id= 'input_data_engineering',
bash_command= run_command3,
dag=dag)
run_command4 = "python /home/airflow/gcs/data/scripts/FSRScript.py /home/airflow/gcs/data/in-data ${OPCO}"
t4 = BashOperator(
task_id= 'FS_report_run',
bash_command= run_command4,
dag=dag)
run_command5 = "tar -czvf ${OPCO}_FSR_Monthly_$date.tar.gz /home/airflow/gcs/data/scripts/OPCO_OUTPUT/* &&" "gsutil cp /home/airflow/gcs/data/scripts/OPCO_OUTPUT/${OPCO}_FSR_Monthly_$date.tar.gz gs://fushionsphere/Output/"
t5 = BashOperator(
task_id= 'uploading_report_to_GCP',
bash_command= run_command5,
dag=dag)
t2 >> t3 >> t4 >> t5