Example #1
def behavior(start_date, end_date, account_name, property_name, profile_name, max_results):
    # Let pandas fetch the data from Google Analytics, returns a generator object
    df_chunks = ga.read_ga(secrets=client_secrets,
                           account_name=account_name,
                           property_name=property_name,
                           profile_name=profile_name,
                           dimensions=['date', 'hour', 'minute'],
                           metrics=['pageviews'],
                           start_date=start_date,
                           end_date=end_date,
                           index_col=0,
                           parse_dates={'datetime': ['date', 'hour', 'minute']},
                           date_parser=lambda x: datetime.strptime(x, '%Y%m%d %H %M'),
                           max_results=max_results,
                           chunksize=10000)

    # Concatenate the chunks into a DataFrame and get number of rows
    df = pd.concat(df_chunks)
    num_rows = df.shape[0]

    # Resample into half-hour buckets
    df = df.resample('30Min', how='sum')

    # Create the behavior table (half-hour x weekday)
    grouped = df.groupby([df.index.time, df.index.weekday])
    behavior = grouped['pageviews'].aggregate(np.sum).unstack()

    # Make sure the table covers all hours and weekdays
    behavior = behavior.reindex(index=pd.date_range("00:00", "23:30", freq="30min").time,
                                columns=range(7))
    behavior.columns = ['MO', 'TU', 'WE', 'TH', 'FR', 'SA', 'SU']

    return behavior, num_rows
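
A minimal usage sketch for the function above; the account, property and profile names are placeholders for your own Google Analytics settings, and the module-level client_secrets object the function relies on is assumed to be configured already:

table, n = behavior(start_date='2014-08-01',
                    end_date='2014-08-31',
                    account_name='My Account',      # placeholder
                    property_name='My Property',    # placeholder
                    profile_name='My Profile',      # placeholder
                    max_results=100000)
print(table.head())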
Example #2
    def test_getdata(self):
        try:
            import httplib2
            from pandas.io.ga import GAnalytics, read_ga
            from pandas.io.auth import AuthenticationConfigError
        except ImportError:
            raise nose.SkipTest

        try:
            end_date = datetime.now()
            start_date = end_date - pd.offsets.Day() * 5
            end_date = end_date.strftime('%Y-%m-%d')
            start_date = start_date.strftime('%Y-%m-%d')

            reader = GAnalytics()
            df = reader.get_data(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']})

            assert isinstance(df, DataFrame)
            assert isinstance(df.index, pd.DatetimeIndex)
            assert len(df) > 1
            assert 'date' not in df
            assert 'hour' not in df
            assert df.index.name == 'ts'
            assert 'avgTimeOnSite' in df
            assert 'visitors' in df
            assert 'newVisits' in df
            assert 'pageviewsPerVisit' in df

            df2 = read_ga(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']})

            assert_frame_equal(df, df2)

        except AuthenticationConfigError:
            raise nose.SkipTest
        except httplib2.ServerNotFoundError:
            try:
                h = httplib2.Http()
                response, content = h.request("http://www.google.com")
                raise
            except httplib2.ServerNotFoundError:
                raise nose.SkipTest
Example #3
    def test_segment(self):
        try:
            end_date = datetime.now()
            start_date = end_date - pd.offsets.Day() * 5
            end_date = end_date.strftime('%Y-%m-%d')
            start_date = start_date.strftime('%Y-%m-%d')

            reader = GAnalytics()
            df = reader.get_data(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                segment=-2,
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']},
                index_col=0)

            self.assertIsInstance(df, pd.DataFrame)
            self.assertIsInstance(df.index, pd.DatetimeIndex)
            self.assertGreater(len(df), 1)
            self.assertTrue('date' not in df)
            self.assertTrue('hour' not in df)
            self.assertEqual(df.index.name, 'ts')
            self.assertTrue('avgTimeOnSite' in df)
            self.assertTrue('visitors' in df)
            self.assertTrue('newVisits' in df)
            self.assertTrue('pageviewsPerVisit' in df)

            # dynamic
            df = read_ga(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                segment="source=~twitter",
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']},
                index_col=0)

            assert isinstance(df, pd.DataFrame)
            assert isinstance(df.index, pd.DatetimeIndex)
            self.assertGreater(len(df), 1)
            self.assertTrue('date' not in df)
            self.assertTrue('hour' not in df)
            self.assertEqual(df.index.name, 'ts')
            self.assertTrue('avgTimeOnSite' in df)
            self.assertTrue('visitors' in df)
            self.assertTrue('newVisits' in df)
            self.assertTrue('pageviewsPerVisit' in df)

        except AuthenticationConfigError:
            raise nose.SkipTest("authentication error")
Example #4
    def test_getdata(self):
        try:
            import httplib2
            from pandas.io.ga import GAnalytics, read_ga
            from pandas.io.auth import AuthenticationConfigError
        except ImportError:
            raise nose.SkipTest

        try:
            end_date = datetime.now()
            start_date = end_date - pd.offsets.Day() * 5
            end_date = end_date.strftime('%Y-%m-%d')
            start_date = start_date.strftime('%Y-%m-%d')

            reader = GAnalytics()
            df = reader.get_data(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']})

            assert isinstance(df, DataFrame)
            assert isinstance(df.index, pd.DatetimeIndex)
            assert len(df) > 1
            assert 'date' not in df
            assert 'hour' not in df
            assert df.index.name == 'ts'
            assert 'avgTimeOnSite' in df
            assert 'visitors' in df
            assert 'newVisits' in df
            assert 'pageviewsPerVisit' in df

            df2 = read_ga(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']})

            assert_frame_equal(df, df2)

        except AuthenticationConfigError:
            raise nose.SkipTest
        except httplib2.ServerNotFoundError:
            try:
                h = httplib2.Http()
                response, content = h.request("http://www.google.com")
                raise
            except httplib2.ServerNotFoundError:
                raise nose.SkipTest
Example #5
    def test_segment(self):
        try:
            end_date = datetime.now()
            start_date = end_date - pd.offsets.Day() * 5
            end_date = end_date.strftime('%Y-%m-%d')
            start_date = start_date.strftime('%Y-%m-%d')

            reader = GAnalytics()
            df = reader.get_data(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                segment=-2,
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']},
                index_col=0)

            self.assertIsInstance(df, pd.DataFrame)
            self.assertIsInstance(df.index, pd.DatetimeIndex)
            self.assertGreater(len(df), 1)
            self.assertTrue('date' not in df)
            self.assertTrue('hour' not in df)
            self.assertEqual(df.index.name, 'ts')
            self.assertTrue('avgTimeOnSite' in df)
            self.assertTrue('visitors' in df)
            self.assertTrue('newVisits' in df)
            self.assertTrue('pageviewsPerVisit' in df)

            # dynamic
            df = read_ga(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                segment="source=~twitter",
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']},
                index_col=0)

            assert isinstance(df, pd.DataFrame)
            assert isinstance(df.index, pd.DatetimeIndex)
            self.assertGreater(len(df), 1)
            self.assertTrue('date' not in df)
            self.assertTrue('hour' not in df)
            self.assertEqual(df.index.name, 'ts')
            self.assertTrue('avgTimeOnSite' in df)
            self.assertTrue('visitors' in df)
            self.assertTrue('newVisits' in df)
            self.assertTrue('pageviewsPerVisit' in df)

        except AuthenticationConfigError:
            raise nose.SkipTest("authentication error")
Example #6
    def test_segment(self):
        try:
            end_date = datetime.now()
            start_date = end_date - pd.offsets.Day() * 5
            end_date = end_date.strftime('%Y-%m-%d')
            start_date = start_date.strftime('%Y-%m-%d')

            reader = GAnalytics()
            df = reader.get_data(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                segment=-2,
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']})

            assert isinstance(df, DataFrame)
            assert isinstance(df.index, pd.DatetimeIndex)
            assert len(df) > 1
            assert 'date' not in df
            assert 'hour' not in df
            assert df.index.name == 'ts'
            assert 'avgTimeOnSite' in df
            assert 'visitors' in df
            assert 'newVisits' in df
            assert 'pageviewsPerVisit' in df

            # dynamic
            df = read_ga(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                segment="source=~twitter",
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']})

            assert isinstance(df, DataFrame)
            assert isinstance(df.index, pd.DatetimeIndex)
            assert len(df) > 1
            assert 'date' not in df
            assert 'hour' not in df
            assert df.index.name == 'ts'
            assert 'avgTimeOnSite' in df
            assert 'visitors' in df
            assert 'newVisits' in df
            assert 'pageviewsPerVisit' in df

        except AuthenticationConfigError:
            raise nose.SkipTest
Example #7
    def test_segment(self):
        try:
            end_date = datetime.now()
            start_date = end_date - pd.offsets.Day() * 5
            end_date = end_date.strftime('%Y-%m-%d')
            start_date = start_date.strftime('%Y-%m-%d')

            reader = GAnalytics()
            df = reader.get_data(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                segment=-2,
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']})

            assert isinstance(df, DataFrame)
            assert isinstance(df.index, pd.DatetimeIndex)
            assert len(df) > 1
            assert 'date' not in df
            assert 'hour' not in df
            assert df.index.name == 'ts'
            assert 'avgTimeOnSite' in df
            assert 'visitors' in df
            assert 'newVisits' in df
            assert 'pageviewsPerVisit' in df

            # dynamic
            df = read_ga(
                metrics=['avgTimeOnSite', 'visitors', 'newVisits',
                         'pageviewsPerVisit'],
                start_date=start_date,
                end_date=end_date,
                segment="source=~twitter",
                dimensions=['date', 'hour'],
                parse_dates={'ts': ['date', 'hour']})

            assert isinstance(df, DataFrame)
            assert isinstance(df.index, pd.DatetimeIndex)
            assert len(df) > 1
            assert 'date' not in df
            assert 'hour' not in df
            assert df.index.name == 'ts'
            assert 'avgTimeOnSite' in df
            assert 'visitors' in df
            assert 'newVisits' in df
            assert 'pageviewsPerVisit' in df

        except AuthenticationConfigError:
            raise nose.SkipTest
Example #8
def behavior(start_date, end_date, account_name, property_name, profile_name,
             max_results):
    """
    Writes a DataFrame with the number of pageviews per half-hours x weekdays
    to the Range "behavior"
    """
    # Let pandas fetch the data from Google Analytics, returns a generator object
    df_chunks = ga.read_ga(
        secrets=client_secrets,
        account_name=account_name,
        property_name=property_name,
        profile_name=profile_name,
        dimensions=['date', 'hour', 'minute'],
        metrics=['pageviews'],
        start_date=start_date,
        end_date=end_date,
        index_col=0,
        parse_dates={'datetime': ['date', 'hour', 'minute']},
        date_parser=lambda x: datetime.strptime(x, '%Y%m%d %H %M'),
        max_results=max_results,
        chunksize=10000)

    # Concatenate the chunks into a DataFrame and get number of rows
    df = pd.concat(df_chunks)
    num_rows = df.shape[0]

    # Resample into half-hour buckets
    df = df.resample('30Min', how='sum')

    # Create the behavior table (half-hour x weekday)
    grouped = df.groupby([df.index.time, df.index.weekday])
    behavior = grouped['pageviews'].aggregate(np.sum).unstack()

    # Make sure the table covers all hours and weekdays
    behavior = behavior.reindex(index=pd.date_range("00:00",
                                                    "23:30",
                                                    freq="30min").time,
                                columns=range(7))
    behavior.columns = ['MO', 'TU', 'WE', 'TH', 'FR', 'SA', 'SU']

    # Write to Excel.
    # Time-only values are currently a bit of a pain on Windows, so we set index=False.
    Range(sheet_dashboard, 'behavior', index=False).value = behavior
    Range(sheet_dashboard, 'effective').value = num_rows
Example #9
def behavior(start_date, end_date, account_name, property_name, profile_name, max_results):
    """
    Writes a DataFrame with the number of pageviews per half-hours x weekdays
    to the Range "behavior"
    """
    # Let pandas fetch the data from Google Analytics, returns a generator object
    df_chunks = ga.read_ga(secrets=client_secrets,
                           account_name=account_name,
                           property_name=property_name,
                           profile_name=profile_name,
                           dimensions=['date', 'hour', 'minute'],
                           metrics=['pageviews'],
                           start_date=start_date,
                           end_date=end_date,
                           index_col=0,
                           parse_dates={'datetime': ['date', 'hour', 'minute']},
                           date_parser=lambda x: datetime.strptime(x, '%Y%m%d %H %M'),
                           max_results=max_results,
                           chunksize=10000)

    # Concatenate the chunks into a DataFrame and get number of rows
    df = pd.concat(df_chunks)
    num_rows = df.shape[0]

    # Resample into half-hour buckets
    df = df.resample('30Min', how='sum')

    # Create the behavior table (half-hour x weekday)
    grouped = df.groupby([df.index.time, df.index.weekday])
    behavior = grouped['pageviews'].aggregate(np.sum).unstack()

    # Make sure the table covers all hours and weekdays
    behavior = behavior.reindex(index=pd.date_range("00:00", "23:30", freq="30min").time,
                                columns=range(7))
    behavior.columns = ['MO', 'TU', 'WE', 'TH', 'FR', 'SA', 'SU']

    # Write to Excel.
    # Time-only values are currently a bit of a pain on Windows, so we set index=False.
    Range(sheet_dashboard, 'behavior').options(index=False).value = behavior
    Range(sheet_dashboard, 'effective').value = num_rows
Example #10
import pandas as pd
import pandas.io.ga as ga

# Define the dimensions and metrics to use with the Analytics API
dimensions = ['pagePath']
metrics = [
    'pageviews', 'uniquePageviews', 'entrances', 'bounceRate', 'exitRate'
]
start_date = '2014-08-01'
end_date = '2014-08-31'

# DataFrame with a single column, named 'pageviews'
df1 = pd.DataFrame(columns=['pageviews'])
count = 0  # counts how many times the loop runs

# Loop through the df column (assumed to hold page slugs), passing each slug
# into the Analytics filter field separately
for i in df[0]:
    # Each API call returns a DataFrame (hwy); append it to df1 on each pass
    hwy = ga.read_ga(metrics=metrics,
                     dimensions=dimensions,
                     start_date=start_date,
                     end_date=end_date,
                     filters=['pagePath==' + i],
                     account_id='26179049')
    df1 = df1.append(hwy)
    count += 1
    print count  # progress: number of slugs processed so far

print df1.head()  # check the first 5 results in the dataframe
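
DataFrame.append copies the accumulated frame on every pass, so the loop above slows down as df1 grows. A sketch of the same idea that collects each per-slug result in a list and concatenates once at the end (same assumptions: df[0] holds the slugs):

frames = []
for i in df[0]:
    frames.append(ga.read_ga(metrics=metrics,
                             dimensions=dimensions,
                             start_date=start_date,
                             end_date=end_date,
                             filters=['pagePath==' + i],
                             account_id='26179049'))
df1 = pd.concat(frames)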
Example #11
def PageData(page, mod):
	'''
	args - pagepath for filter
		 - Model type - takes 'LR' for LASSO and 'RF' for Random Forests
	'''
	
	
	Store = []
	t = datetime.today()
	t2 = t - timedelta(hours=1)  #(2 hours for BST, 1 for UTC which is on Heroku server)
	delay = t2.strftime('%Y-%m-%d %H:00')
	star = t - timedelta(30)


	max_results = 5e7
	metrics = ['pageviews']
	dimensions = ['pagePath', 'hour', 'date']
	dim = ['date', 'hour']
	
	if page != None:
		filters = ['pagePath=='+page]
	else:
		filters = None

	df1 = ga.read_ga(metrics,
						 dimensions = dim,
						 start_date = star, 
						 end_date = delay, 
						 parse_dates=[['date', 'hour']],
						 token_file_name = 'static/token/analytics.dat',
						 secrets = 'static/token/client_secrets.json',
						 account_id = '26179049',
						 filters = filters
						 )

	##################### 72 MAX LAG ##############################

	ind = []

	for i in range(72, len(df1)):

		lag = [1,2,3,4,5,10,17,24,48,72]
		lagx = list(i - np.array(lag))
	
		Train = df1.ix[lagx]
		Target = df1.ix[i]

		TT = Train.T
		TT.columns = lag
		TT['Target'] = Target['pageviews']
		ind.append(TT)

	rng = date_range(df1.index[lag[-1]], df1.index[len(df1)-1], freq='H')
	Set = ind[0].append(ind[1:])
	Set.index = rng
	SetT = Set.ix[:delay]
	print SetT

	#############################################################

	li = []
	
	if mod == 'LR':
		cnt = 1
	else:
		cnt = 3
	
	feats = len(lag)
	SetT = SetT.replace(0,1)

	X_Train = SetT[SetT.columns[0:feats]][:-170]
	Y_Train = SetT['Target'].ix[:-170]


	X_Test = SetT[SetT.columns[0:feats]][-170:]
	Y_Test = SetT['Target'][-170:]
	
	Store.append(X_Train)
	Store.append(Y_Train)
	Store.append(X_Test)
	Store.append(Y_Test)
	
	
	for j in range(0,cnt):

		print j
	
		#Train Model
		# feats = len(lag)
# 		SetT = SetT.replace(0,1)
# 
# 		X_Train = SetT[SetT.columns[0:feats]][:-170]
# 		Y_Train = SetT['Target'].ix[:-170]
# 
# 
# 		X_Test = SetT[SetT.columns[0:feats]][-170:]
# 		Y_Test = SetT['Target'][-170:]

		
		if mod == 'RF':
		
			print 50*'-'
			print "Random Forest Regression"
			print 50*'-'
			rf = RandomForestRegressor(n_estimators=500, max_features=feats)
		
		else:
			print 50*'-'
			print "LASSO Regression"
			print 50*'-'

			################################################################################
			# Lasso with path and cross-validation using LassoCV path
			from sklearn.linear_model import LassoCV
# 
			lasso_cv = LassoCV()
# 
			y_ = lasso_cv.fit(X_Train, Y_Train)
			rf = y_ 
			
# 			rf = linear_model.LinearRegression()
		
		
		
		
		rf.fit(X_Train, Y_Train)
		PredRF = rf.predict(X_Test)
		scoreRF = r2_score(Y_Test,PredRF)
		MSE = np.mean(np.square(Y_Test-PredRF))
		print 'R2 Score = ', scoreRF 
		print 'MSE = ',  MSE

		Res = pd.DataFrame(columns=['res'], index = (range(0,len(Y_Test))))
		resid = Y_Test-PredRF
		Res['res'] = resid.values

		TSDU = Res['res'].mean()+3*Res['res'].std()
		TSDL = Res['res'].mean()-3*Res['res'].std()
		tsdP = Res[Res['res']>(Res['res'].mean()+3*Res['res'].std())]
		tsdN = Res[Res['res']<(Res['res'].mean()-3*Res['res'].std())]

		Stats = pd.DataFrame(columns=['yt','pred','resid','TSDU','TSDL'], index=X_Test.index)
		Stats['yt'] = Y_Test
		Stats['pred'] = PredRF
		Stats['resid'] = resid
		Stats['TSDU'] = TSDU
		Stats['TSDL'] = TSDL

		######### Plotting disabled for Heroku build ############################

		# plt.figure(5)
		# 
		# plt.subplot(2, 1, 1)
		# Stats['yt'].plot()
		# plt.scatter(Stats['pred'].index, Stats['pred'], s=70, alpha=0.5, c='r')
		# 
		# plt.title("Random Forest Model")
		# 
		# plt.subplot(2,1,2)

		#######################################################################

		# Stats['resid'].plot()
		# Stats['TSDU'].plot(c='r')
		# Stats['TSDL'].plot(c='r')

		Stats.index.name = 'Time'

		Stats['time'] = Stats.index
		Stats['pred'].astype('int')
		Stats['resid'].astype('int')
		Stats['TSDU'].astype('int')
		Stats['TSDL'].astype('int')

		li.append(Stats)
	# plt.title('Residuals and 2 s.d. lines')

	cat = pd.concat(([i for i in li]))
	Stats = cat.groupby(cat.index).mean()
	Stats['time'] = Stats.index

	AP = len(tsdP) + len(tsdN)

	print 50*'-'  
	# print "PAGE: " + str(filters[0])
	print 50*'-'
	print "RANDOM FOREST Number of Anomalous Points is: %i" % AP
	print "%% Anomalous points is: %.1f" % (100*AP/len(Y_Test)) +'%'
	
	
	return Stats, Store, scoreRF
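
A hedged usage sketch: the page path below is a placeholder, and the token and secrets files referenced inside the function are assumed to exist on disk:

Stats, Store, score = PageData('/bank-holidays', 'RF')  # placeholder page path
print score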
Example #12
def Trial(): 

	t = datetime.today()

	yesterday = t - timedelta(0)
	dbyy = t - timedelta(90)

	start  = dbyy.strftime('%Y-%m-%d')
	today = t.strftime('%Y-%m-%d %H:00')
	# end = yesterday.strftime('%Y-%m-%d')

	start = "2014-06-07"
	end = "2014-06-10"

	top100 = [u'/jobsearch', u'/search', u'/', u'/tax-disc', u'/renew-adult-passport', u'/student-finance-register-login', u'/visas-immigration', u'/driving-transaction-finished', u'/browse/abroad/passports', u'/apply-uk-visa', u'/browse/driving', u'/check-uk-visa', u'/get-a-passport-urgently', u'/apply-renew-passport', u'/government/organisations/uk-visas-and-immigration', u'/book-practical-driving-test', u'/bank-holidays', u'/change-date-practical-driving-test', u'/government/organisations/driver-and-vehicle-licensing-agency', u'/contact-jobcentre-plus', u'/book-a-driving-theory-test', u'/browse/driving/driving-licences', u'/benefits-calculators', u'/check-uk-visa/y', u'/jobseekers-allowance/how-to-claim', u'/national-minimum-wage-rates', u'/tax-credits-calculator', u'/browse/benefits/tax-credits', u'/browse/benefits', u'/change-address-driving-licence', u'/contact-the-dvla', u'/browse/working/finding-job', u'/calculate-state-pension', u'/passport-fees', u'/browse/working', u'/contact/hm-revenue-customs/tax-credits-enquiries', u'/overseas-passports', u'/track-passport-application', u'/renew-driving-licence', u'/browse/abroad', u'/get-vehicle-information-from-dvla', u'/apply-first-provisional-driving-licence', u'/student-finance/overview', u'/browse/driving/car-tax-discs', u'/general-visit-visa', u'/apply-online-to-replace-a-driving-licence', u'/government/organisations/hm-passport-office', u'/check-mot-status', u'/uk-border-control', u'/get-a-child-passport', u'/practise-your-driving-theory-test', u'/renewing-your-tax-credits-claim', u'/renewtaxcredits', u'/calculate-state-pension/y', u'/student-finance', u'/photos-for-passports', u'/contact-student-finance-england', u'/visa-processing-times', u'/foreign-travel-advice', u'/jobseekers-allowance', u'/contact', u'/browse/education/student-finance', u'/calculate-vehicle-tax-rates', u'/find-a-visa-application-centre', u'/working-tax-credit', u'/renew-driving-licence-at-70', u'/passport-advice-line', u'/call-charges', u'/overseas-passports/y', u'/countersigning-passport-applications', u'/government/topical-events/sexual-violence-in-conflict', u'/how-the-post-office-check-and-send-service-works', u'/visa-fees', u'/government/organisations', u'/browse/driving/learning-to-drive', u'/browse/working/state-pension', u'/vehicle-tax-rate-tables', u'/get-a-child-passport/your-childs-first-passport', u'/calculate-state-pension/y/age', u'/make-a-sorn', u'/jobseekers-allowance/what-youll-get', u'/general-visit-visa/apply', u'/contact/govuk/anonymous-feedback/thankyou', u'/browse/citizenship/citizenship', u'/general-visit-visa/documents-you-must-provide', u'/jobseekers-allowance/overview', u'/uk-border-control/before-you-leave-for-the-uk', u'/government/organisations/foreign-commonwealth-office', u'/government/collections/national-curriculum', u'/government/organisations/ministry-of-defence', u'/ips-regional-passport-office', u'/hand-luggage-restrictions/overview', u'/jobseekers-allowance/eligibility', u'/register-to-vote', u'/disclosure-barring-service-check/overview', u'/browse/benefits/jobseekers-allowance', u'/dvlaforms', u'/tier-4-general-visa', u'/student-finance/loans-and-grants']
	
	# the above should not be hardcoded. Calculate each time so as to avoid page name changes etc.

	max_results = 5e7
	metrics = ['pageviews']
	dimensions = ['pagePath', 'hour', 'date']
	dim = ['date', 'hour']
	filters = ['pagePath=='+top100[97]]

	###############
	#Find Top 100 pages by pageviews - (pv fine in this case rather than upv)

	# df = ga.read_ga(metrics, 
	# 					dim, 
	# 					start_date = start,
	# 					end_date = end,
	# 					token_file_name = 'static/token/analytics.dat',
	# 					secrets = 'static/token/client_secrets.json',
	# 					account_id = '26179049',
	# 					max_results=max_results,        
	#                     chunksize=5000
	# 			   )
	# df1b = pd.concat([x for x in df])
	# df1c = df1b.sort(columns=['pageviews'], ascending=0)

	df1 = ga.read_ga(metrics,
						 dimensions = dim,
						 start_date = dbyy, 
						 end_date = yesterday, 
						 parse_dates=[['date', 'hour']],
						 token_file_name = 'static/token/analytics.dat',
						 secrets = 'static/token/client_secrets.json',
						 account_id = '26179049'
	# 					 filters = filters
						 )

	##################### 48 MAX LAG ##############################

	ind = []

	for i in range(48, len(df1)):

		lag = [1,2,3,24,48]
		lagx = list(i - np.array(lag))
	
		Train = df1.ix[lagx]
		Target = df1.ix[i]

		TT = Train.T
		TT.columns = [1,2,3,24,48]
		TT['Target'] = Target['pageviews']
		ind.append(TT)

	rng = date_range(df1.index[48], df1.index[len(df1)-1], freq='H')
	Set = ind[0].append(ind[1:])
	Set.index = rng
	SetT = Set.ix[:today][:-1]
	print SetT

	##################### 7 day trial ##############################

	# ind = []
	# 
	# for i in range(168, len(df1)):
	# 
	# 	lag = [1,2,3,24,48, 168]
	# 	lagx = list(i - np.array(lag))
	# 	
	# 	Train = df1.ix[lagx]
	# 	Target = df1.ix[i]
	# 
	# 	TT = Train.T
	# 	TT.columns = [1,2,3,24,48, 168]
	# 	TT['Target'] = Target['pageviews']
	# 	ind.append(TT)
	# 
	# rng = date_range(df1.index[168], df1.index[len(df1)-1], freq='H')
	# Set = ind[0].append(ind[1:])
	# Set.index = rng
	# SetT = Set.ix[:today][:-1]
	# print SetT


	#################################################
	TrainSamp = 0.8
	feats = 5
	TS = int(np.round(TrainSamp*len(SetT)))

	X_Train = SetT[SetT.columns[0:feats]].head(TS)
	Y_Train = SetT['Target'].head(TS)

	X_Test = SetT[SetT.columns[0:feats]].ix[TS:]
	Y_Test = SetT['Target'].ix[TS:]

	X_Train = X_Train.replace(0,1)
	X_Test = X_Test.replace(0,1)
	Y_Train = Y_Train.replace(0,1)
	Y_Test = Y_Test.replace(0,1)

	print 50*'-'
	print "Random Forest Regression"
	print 50*'-'
	rf = RandomForestRegressor(n_estimators=500, max_features=feats)
	rf.fit(X_Train, Y_Train)
	PredRF = rf.predict(X_Test)
	scoreRF = r2_score(Y_Test,PredRF)
	MSE = np.mean(np.square(Y_Test-PredRF))
	print 'R2 Score = ', scoreRF 
	print 'MSE = ',  MSE

	Res = pd.DataFrame(columns=['res'], index = (range(0,len(Y_Test))))
	resid = Y_Test-PredRF
	Res['res'] = resid.values

	TSDU = Res['res'].mean()+3*Res['res'].std()
	TSDL = Res['res'].mean()-3*Res['res'].std()
	tsdP = Res[Res['res']>(Res['res'].mean()+3*Res['res'].std())]
	tsdN = Res[Res['res']<(Res['res'].mean()-3*Res['res'].std())]


	plt.figure(2)
	plt.plot(Y_Test, Y_Test)
	plt.scatter(Y_Test,PredRF, s=40, alpha=0.5, c='r')

	##############################
	plt.figure(1)

	plt.subplot(2, 2, 1)
	plt.plot(range(0,len(Y_Test)), Y_Test)
	plt.scatter(range(0,len(Y_Test)), PredRF, s=70, alpha=0.5, c='r')
	plt.xlim([0,len(Y_Test)])
	plt.title("Random Forest Model")


	plt.subplot(2,2,3)

	plt.plot(range(0,len(Y_Test)),resid)
	plt.plot(range(0,len(Y_Test)), [TSDU]*len(Y_Test), c='r')
	plt.plot(range(0,len(Y_Test)), [TSDL]*len(Y_Test), c='r')
	plt.xlim([0,len(Y_Test)])

	###############################





	AP = len(tsdP) + len(tsdN)

	print 50*'-'  
	print "PAGE: " + str(filters[0])
	print 50*'-'
	print "RANDOM FOREST Number of Anomalous Points is: %i" % AP
	print "%% Anomalous points is: %.1f" % (100*AP/len(Y_Test)) +'%'

	#################################################################

	print 50*'-'
	print "Linear Model"
	print 50*'-'
	clf = linear_model.LinearRegression()
	clf.fit(np.log(X_Train), np.log(Y_Train))
	LMPred = clf.predict(np.log(X_Test))
	scoreLM = r2_score(np.log(Y_Test),LMPred)
	MSELM =  np.mean(np.square(Y_Test-np.exp(LMPred)))
	print "R2 Score = ", scoreLM
	print "MSE = ", MSELM


	###########################################################

	print 50*'-'
	print "LASSO Regression"
	print 50*'-'


	################################################################################
	# Lasso with path and cross-validation using LassoCV path
	from sklearn.linear_model import LassoCV

	lasso_cv = LassoCV()

	y_ = lasso_cv.fit(X_Train, Y_Train).predict(X_Test)
	# .predict(X_Test)

	print "Optimal regularization parameter  = %s" % lasso_cv.alpha_

	# Compute explained variance on test data
	print "r^2 on test data : %f" % (1 - np.linalg.norm(Y_Test - y_)**2
										  / np.linalg.norm(Y_Test)**2)

	print "LASSO MSE: ", np.mean(np.square((Y_Test-(y_))))

	Res2 = pd.DataFrame(columns=['res'], index = (range(0,len(Y_Test))))
	resid2 = Y_Test-y_
	Res2['res'] = resid2.values

	TSDU2 = Res2['res'].mean()+3*Res2['res'].std()
	TSDL2 = Res2['res'].mean()-3*Res2['res'].std()
	tsdP2 = Res2[Res2['res']>(Res2['res'].mean()+3*Res2['res'].std())]
	tsdN2 = Res2[Res2['res']<(Res2['res'].mean()-3*Res2['res'].std())]

	AP = len(tsdP2) + len(tsdN2)


	print 50*'-'  
	print "PAGE: " + str(filters[0])
	print 50*'-'
	print "LASSP Number of Anomalous Points is: %i" % AP
	print "%% Anomalous points is: %.1f" % (100*AP/len(Y_Test)) +'%'

	# plt.figure(3) #################################################

	plt.subplot(2, 2, 2)
	plt.plot(range(0,len(Y_Test)), Y_Test)
	plt.scatter(range(0,len(Y_Test)), y_, s=70, alpha=0.5, c='r')
	plt.xlim([0,len(Y_Test)])
	plt.title('LASSO Model')


	plt.subplot(2,2,4)

	plt.plot(range(0,len(Y_Test)),resid2)
	plt.plot(range(0,len(Y_Test)), [TSDU2]*len(Y_Test), c='r')
	plt.plot(range(0,len(Y_Test)), [TSDL2]*len(Y_Test), c='r')
	plt.xlim([0,len(Y_Test)])
Example #13
import pandas.io.ga as ga

# prints top 30 landing pages by pageviews in descending order

df = ga.read_ga(['pageviews', 'entrances'],
                dimensions=['date', 'landingPagePath'],
                start_date='2013-04-01')

reset_df = df.reset_index()
sorted_df = reset_df.sort(columns=['pageviews', 'entrances'], ascending=False)

print (sorted_df.head(30))
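
On pandas 0.17 and later (but still before pandas.io.ga was removed in 0.19.0), DataFrame.sort was deprecated in favour of sort_values; the equivalent call would be:

sorted_df = reset_df.sort_values(by=['pageviews', 'entrances'], ascending=False)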
Example #14
import time

import pandas as pd
import pandas.io.ga as ga

# account_id, view_id and property_id are assumed to be defined elsewhere
start_date = '2016-01-01'
end_date = '2017-01-01'

dimensions = ['date', 'cityId', 'city']

# Metrics for Request
metrics = ['sessions', 'users']

tempStartTime = start_date

count = 0
while str(tempStartTime) <= end_date:
    temp_df = ga.read_ga(account_id=account_id,
                         profile_id=view_id,
                         property_id=property_id,
                         metrics=metrics,
                         dimensions=dimensions,
                         start_date=tempStartTime,
                         end_date=end_date,
                         index_col=0)

    tempStartTime = temp_df.index.max()

    if count == 0:
        df = temp_df
    else:
        df = pd.concat([df, temp_df])
    count += 1
    time.sleep(.2)

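Because each new window starts at the last date already fetched, the boundary row can show up in two consecutive chunks. One way to drop the overlap once the loop finishes (a sketch, assuming the index holds no legitimate duplicate dates):

df = df[~df.index.duplicated(keep='first')]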
Example #15
# -*- coding: utf-8 -*- 
import pandas.io.ga as ga
import pandas as pd

# goes through profiles/accounts and provides an aggregate of visits/pageviews
ids = {'123': '456'}

# ';' joins the two conditions with AND in the Core Reporting filter syntax
filters = "source==Facebook;medium==Social"

all_data = pd.DataFrame()

for profile, account in ids.iteritems():    
    df = ga.read_ga(['visits', 'pageviews'],
                    dimensions=['date', 'landingPagePath', 'medium',
                                'campaign', 'source'],
                    start_date='2013-03-01', end_date='2013-04-24',
                    account_id=account, profile_id=profile, filters=filters,
                    chunksize=1000)

    for d in df:
        d['profile'] = profile  
        d = d.reset_index()
        all_data = all_data.append(d)

all_data.to_csv('C:\\tmp\\322.csv', sep=',', line_terminator='\n')
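
The filter syntax also supports OR via ',', so matching sessions whose source is Facebook or whose medium is Social would instead be:

filters = "source==Facebook,medium==Social"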
Example #16
def PageData(page, mod):
    '''
    args - pagepath for filter
         - Model type - takes 'LR' for LASSO and 'RF' for Random Forests
    '''

    Store = []
    t = datetime.today()
    t2 = t - timedelta(
        hours=1)  #(2 hours for BST, 1 for UTC which is on Heroku server)
    delay = t2.strftime('%Y-%m-%d %H:00')
    star = t - timedelta(30)

    max_results = 5e7
    metrics = ['pageviews']
    dimensions = ['pagePath', 'hour', 'date']
    dim = ['date', 'hour']

    if page != None:
        filters = ['pagePath==' + page]
    else:
        filters = None

    df1 = ga.read_ga(metrics,
                     dimensions=dim,
                     start_date=star,
                     end_date=delay,
                     parse_dates=[['date', 'hour']],
                     token_file_name='static/token/analytics.dat',
                     secrets='static/token/client_secrets.json',
                     account_id='26179049',
                     filters=filters)

    ##################### 72 MAX LAG ##############################

    ind = []

    for i in range(72, len(df1)):

        lag = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]
        lagx = list(i - np.array(lag))

        Train = df1.ix[lagx]
        Target = df1.ix[i]

        TT = Train.T
        TT.columns = lag
        TT['Target'] = Target['pageviews']
        ind.append(TT)

    rng = date_range(df1.index[lag[-1]], df1.index[len(df1) - 1], freq='H')
    Set = ind[0].append(ind[1:])
    Set.index = rng
    SetT = Set.ix[:delay]
    print SetT

    #############################################################

    li = []

    if mod == 'LR':
        cnt = 1
    else:
        cnt = 3

    feats = len(lag)
    SetT = SetT.replace(0, 1)

    X_Train = SetT[SetT.columns[0:feats]][:-170]
    Y_Train = SetT['Target'].ix[:-170]

    X_Test = SetT[SetT.columns[0:feats]][-170:]
    Y_Test = SetT['Target'][-170:]

    Store.append(X_Train)
    Store.append(Y_Train)
    Store.append(X_Test)
    Store.append(Y_Test)

    for j in range(0, cnt):

        print j

        #Train Model
        # feats = len(lag)
        # 		SetT = SetT.replace(0,1)
        #
        # 		X_Train = SetT[SetT.columns[0:feats]][:-170]
        # 		Y_Train = SetT['Target'].ix[:-170]
        #
        #
        # 		X_Test = SetT[SetT.columns[0:feats]][-170:]
        # 		Y_Test = SetT['Target'][-170:]

        if mod == 'RF':

            print 50 * '-'
            print "Random Forest Regression"
            print 50 * '-'
            rf = RandomForestRegressor(n_estimators=500, max_features=feats)

        else:
            print 50 * '-'
            print "LASSO Regression"
            print 50 * '-'

            ################################################################################
            # Lasso with path and cross-validation using LassoCV path
            from sklearn.linear_model import LassoCV
            #
            lasso_cv = LassoCV()
            #
            y_ = lasso_cv.fit(X_Train, Y_Train)
            rf = y_


# 			rf = linear_model.LinearRegression()

        rf.fit(X_Train, Y_Train)
        PredRF = rf.predict(X_Test)
        scoreRF = r2_score(Y_Test, PredRF)
        MSE = np.mean(np.square(Y_Test - PredRF))
        print 'R2 Score = ', scoreRF
        print 'MSE = ', MSE

        Res = pd.DataFrame(columns=['res'], index=(range(0, len(Y_Test))))
        resid = Y_Test - PredRF
        Res['res'] = resid.values

        TSDU = Res['res'].mean() + 3 * Res['res'].std()
        TSDL = Res['res'].mean() - 3 * Res['res'].std()
        tsdP = Res[Res['res'] > (Res['res'].mean() + 3 * Res['res'].std())]
        tsdN = Res[Res['res'] < (Res['res'].mean() - 3 * Res['res'].std())]

        Stats = pd.DataFrame(columns=['yt', 'pred', 'resid', 'TSDU', 'TSDL'],
                             index=X_Test.index)
        Stats['yt'] = Y_Test
        Stats['pred'] = PredRF
        Stats['resid'] = resid
        Stats['TSDU'] = TSDU
        Stats['TSDL'] = TSDL

        ######### Plotting disabled for Heroku build ############################

        # plt.figure(5)
        #
        # plt.subplot(2, 1, 1)
        # Stats['yt'].plot()
        # plt.scatter(Stats['pred'].index, Stats['pred'], s=70, alpha=0.5, c='r')
        #
        # plt.title("Random Forest Model")
        #
        # plt.subplot(2,1,2)

        #######################################################################

        # Stats['resid'].plot()
        # Stats['TSDU'].plot(c='r')
        # Stats['TSDL'].plot(c='r')

        Stats.index.name = 'Time'

        Stats['time'] = Stats.index
        Stats['pred'].astype('int')
        Stats['resid'].astype('int')
        Stats['TSDU'].astype('int')
        Stats['TSDL'].astype('int')

        li.append(Stats)
    # plt.title('Residuals and 2 s.d. lines')

    cat = pd.concat(([i for i in li]))
    Stats = cat.groupby(cat.index).mean()
    Stats['time'] = Stats.index

    AP = len(tsdP) + len(tsdN)

    print 50 * '-'
    # print "PAGE: " + str(filters[0])
    print 50 * '-'
    print "RANDOM FOREST Number of Anomalous Points is: %i" % AP
    print "%% Anomalous points is: %.1f" % (100 * AP / len(Y_Test)) + '%'

    return Stats, Store, scoreRF
Example #17
def Trial():

    t = datetime.today()

    yesterday = t - timedelta(0)
    dbyy = t - timedelta(90)

    start = dbyy.strftime('%Y-%m-%d')
    today = t.strftime('%Y-%m-%d %H:00')
    # end = yesterday.strftime('%Y-%m-%d')

    start = "2014-06-07"
    end = "2014-06-10"

    top100 = [
        u'/jobsearch', u'/search', u'/', u'/tax-disc',
        u'/renew-adult-passport', u'/student-finance-register-login',
        u'/visas-immigration', u'/driving-transaction-finished',
        u'/browse/abroad/passports', u'/apply-uk-visa', u'/browse/driving',
        u'/check-uk-visa', u'/get-a-passport-urgently',
        u'/apply-renew-passport',
        u'/government/organisations/uk-visas-and-immigration',
        u'/book-practical-driving-test', u'/bank-holidays',
        u'/change-date-practical-driving-test',
        u'/government/organisations/driver-and-vehicle-licensing-agency',
        u'/contact-jobcentre-plus', u'/book-a-driving-theory-test',
        u'/browse/driving/driving-licences', u'/benefits-calculators',
        u'/check-uk-visa/y', u'/jobseekers-allowance/how-to-claim',
        u'/national-minimum-wage-rates', u'/tax-credits-calculator',
        u'/browse/benefits/tax-credits', u'/browse/benefits',
        u'/change-address-driving-licence', u'/contact-the-dvla',
        u'/browse/working/finding-job', u'/calculate-state-pension',
        u'/passport-fees', u'/browse/working',
        u'/contact/hm-revenue-customs/tax-credits-enquiries',
        u'/overseas-passports', u'/track-passport-application',
        u'/renew-driving-licence', u'/browse/abroad',
        u'/get-vehicle-information-from-dvla',
        u'/apply-first-provisional-driving-licence',
        u'/student-finance/overview', u'/browse/driving/car-tax-discs',
        u'/general-visit-visa', u'/apply-online-to-replace-a-driving-licence',
        u'/government/organisations/hm-passport-office', u'/check-mot-status',
        u'/uk-border-control', u'/get-a-child-passport',
        u'/practise-your-driving-theory-test',
        u'/renewing-your-tax-credits-claim', u'/renewtaxcredits',
        u'/calculate-state-pension/y', u'/student-finance',
        u'/photos-for-passports', u'/contact-student-finance-england',
        u'/visa-processing-times', u'/foreign-travel-advice',
        u'/jobseekers-allowance', u'/contact',
        u'/browse/education/student-finance', u'/calculate-vehicle-tax-rates',
        u'/find-a-visa-application-centre', u'/working-tax-credit',
        u'/renew-driving-licence-at-70', u'/passport-advice-line',
        u'/call-charges', u'/overseas-passports/y',
        u'/countersigning-passport-applications',
        u'/government/topical-events/sexual-violence-in-conflict',
        u'/how-the-post-office-check-and-send-service-works', u'/visa-fees',
        u'/government/organisations', u'/browse/driving/learning-to-drive',
        u'/browse/working/state-pension', u'/vehicle-tax-rate-tables',
        u'/get-a-child-passport/your-childs-first-passport',
        u'/calculate-state-pension/y/age', u'/make-a-sorn',
        u'/jobseekers-allowance/what-youll-get', u'/general-visit-visa/apply',
        u'/contact/govuk/anonymous-feedback/thankyou',
        u'/browse/citizenship/citizenship',
        u'/general-visit-visa/documents-you-must-provide',
        u'/jobseekers-allowance/overview',
        u'/uk-border-control/before-you-leave-for-the-uk',
        u'/government/organisations/foreign-commonwealth-office',
        u'/government/collections/national-curriculum',
        u'/government/organisations/ministry-of-defence',
        u'/ips-regional-passport-office',
        u'/hand-luggage-restrictions/overview',
        u'/jobseekers-allowance/eligibility', u'/register-to-vote',
        u'/disclosure-barring-service-check/overview',
        u'/browse/benefits/jobseekers-allowance', u'/dvlaforms',
        u'/tier-4-general-visa', u'/student-finance/loans-and-grants'
    ]

    # the above should not be hardcoded. Calculate each time so as to avoid page name changes etc.

    max_results = 5e7
    metrics = ['pageviews']
    dimensions = ['pagePath', 'hour', 'date']
    dim = ['date', 'hour']
    filters = ['pagePath==' + top100[97]]

    ###############
    #Find Top 100 pages by pageviews - (pv fine in this case rather than upv)

    # df = ga.read_ga(metrics,
    # 					dim,
    # 					start_date = start,
    # 					end_date = end,
    # 					token_file_name = 'static/token/analytics.dat',
    # 					secrets = 'static/token/client_secrets.json',
    # 					account_id = '26179049',
    # 					max_results=max_results,
    #                     chunksize=5000
    # 			   )
    # df1b = pd.concat([x for x in df])
    # df1c = df1b.sort(columns=['pageviews'], ascending=0)

    df1 = ga.read_ga(metrics,
                     dimensions=dim,
                     start_date=dbyy,
                     end_date=yesterday,
                     parse_dates=[['date', 'hour']],
                     token_file_name='static/token/analytics.dat',
                     secrets='static/token/client_secrets.json',
                     account_id='26179049'
                     # 					 filters = filters
                     )

    ##################### 48 MAX LAG ##############################

    ind = []

    for i in range(48, len(df1)):

        lag = [1, 2, 3, 24, 48]
        lagx = list(i - np.array(lag))

        Train = df1.ix[lagx]
        Target = df1.ix[i]

        TT = Train.T
        TT.columns = [1, 2, 3, 24, 48]
        TT['Target'] = Target['pageviews']
        ind.append(TT)

    rng = date_range(df1.index[48], df1.index[len(df1) - 1], freq='H')
    Set = ind[0].append(ind[1:])
    Set.index = rng
    SetT = Set.ix[:today][:-1]
    print SetT

    ##################### 7 day trial ##############################

    # ind = []
    #
    # for i in range(168, len(df1)):
    #
    # 	lag = [1,2,3,24,48, 168]
    # 	lagx = list(i - np.array(lag))
    #
    # 	Train = df1.ix[lagx]
    # 	Target = df1.ix[i]
    #
    # 	TT = Train.T
    # 	TT.columns = [1,2,3,24,48, 168]
    # 	TT['Target'] = Target['pageviews']
    # 	ind.append(TT)
    #
    # rng = date_range(df1.index[168], df1.index[len(df1)-1], freq='H')
    # Set = ind[0].append(ind[1:])
    # Set.index = rng
    # SetT = Set.ix[:today][:-1]
    # print SetT

    #################################################
    TrainSamp = 0.8
    feats = 5
    TS = int(np.round(TrainSamp * len(SetT)))

    X_Train = SetT[SetT.columns[0:feats]].head(TS)
    Y_Train = SetT['Target'].head(TS)

    X_Test = SetT[SetT.columns[0:feats]].ix[TS:]
    Y_Test = SetT['Target'].ix[TS:]

    X_Train = X_Train.replace(0, 1)
    X_Test = X_Test.replace(0, 1)
    Y_Train = Y_Train.replace(0, 1)
    Y_Test = Y_Test.replace(0, 1)

    print 50 * '-'
    print "Random Forest Regression"
    print 50 * '-'
    rf = RandomForestRegressor(n_estimators=500, max_features=feats)
    rf.fit(X_Train, Y_Train)
    PredRF = rf.predict(X_Test)
    scoreRF = r2_score(Y_Test, PredRF)
    MSE = np.mean(np.square(Y_Test - PredRF))
    print 'R2 Score = ', scoreRF
    print 'MSE = ', MSE

    Res = pd.DataFrame(columns=['res'], index=(range(0, len(Y_Test))))
    resid = Y_Test - PredRF
    Res['res'] = resid.values

    TSDU = Res['res'].mean() + 3 * Res['res'].std()
    TSDL = Res['res'].mean() - 3 * Res['res'].std()
    tsdP = Res[Res['res'] > (Res['res'].mean() + 3 * Res['res'].std())]
    tsdN = Res[Res['res'] < (Res['res'].mean() - 3 * Res['res'].std())]

    plt.figure(2)
    plt.plot(Y_Test, Y_Test)
    plt.scatter(Y_Test, PredRF, s=40, alpha=0.5, c='r')

    ##############################
    plt.figure(1)

    plt.subplot(2, 2, 1)
    plt.plot(range(0, len(Y_Test)), Y_Test)
    plt.scatter(range(0, len(Y_Test)), PredRF, s=70, alpha=0.5, c='r')
    plt.xlim([0, len(Y_Test)])
    plt.title("Random Forest Model")

    plt.subplot(2, 2, 3)

    plt.plot(range(0, len(Y_Test)), resid)
    plt.plot(range(0, len(Y_Test)), [TSDU] * len(Y_Test), c='r')
    plt.plot(range(0, len(Y_Test)), [TSDL] * len(Y_Test), c='r')
    plt.xlim([0, len(Y_Test)])

    ###############################

    AP = len(tsdP) + len(tsdN)

    print 50 * '-'
    print "PAGE: " + str(filters[0])
    print 50 * '-'
    print "RANDOM FOREST Number of Anomalous Points is: %i" % AP
    print "%% Anomalous points is: %.1f" % (100 * AP / len(Y_Test)) + '%'

    #################################################################

    print 50 * '-'
    print "Linear Model"
    print 50 * '-'
    clf = linear_model.LinearRegression()
    clf.fit(np.log(X_Train), np.log(Y_Train))
    LMPred = clf.predict(np.log(X_Test))
    scoreLM = r2_score(np.log(Y_Test), LMPred)
    MSELM = np.mean(np.square(Y_Test - np.exp(LMPred)))
    print "R2 Score = ", scoreLM
    print "MSE = ", MSELM

    ###########################################################

    print 50 * '-'
    print "LASSO Regression"
    print 50 * '-'

    ################################################################################
    # Lasso with path and cross-validation using LassoCV path
    from sklearn.linear_model import LassoCV

    lasso_cv = LassoCV()

    y_ = lasso_cv.fit(X_Train, Y_Train).predict(X_Test)
    # .predict(X_Test)

    print "Optimal regularization parameter  = %s" % lasso_cv.alpha_

    # Compute explained variance on test data
    print "r^2 on test data : %f" % (
        1 - np.linalg.norm(Y_Test - y_)**2 / np.linalg.norm(Y_Test)**2)

    print "LASSO MSE: ", np.mean(np.square((Y_Test - (y_))))

    Res2 = pd.DataFrame(columns=['res'], index=(range(0, len(Y_Test))))
    resid2 = Y_Test - y_
    Res2['res'] = resid2.values

    TSDU2 = Res2['res'].mean() + 3 * Res2['res'].std()
    TSDL2 = Res2['res'].mean() - 3 * Res2['res'].std()
    tsdP2 = Res2[Res2['res'] > (Res2['res'].mean() + 3 * Res2['res'].std())]
    tsdN2 = Res2[Res2['res'] < (Res2['res'].mean() - 3 * Res2['res'].std())]

    AP = len(tsdP2) + len(tsdN2)

    print 50 * '-'
    print "PAGE: " + str(filters[0])
    print 50 * '-'
    print "LASSP Number of Anomalous Points is: %i" % AP
    print "%% Anomalous points is: %.1f" % (100 * AP / len(Y_Test)) + '%'

    # plt.figure(3) #################################################

    plt.subplot(2, 2, 2)
    plt.plot(range(0, len(Y_Test)), Y_Test)
    plt.scatter(range(0, len(Y_Test)), y_, s=70, alpha=0.5, c='r')
    plt.xlim([0, len(Y_Test)])
    plt.title('LASSO Model')

    plt.subplot(2, 2, 4)

    plt.plot(range(0, len(Y_Test)), resid2)
    plt.plot(range(0, len(Y_Test)), [TSDU2] * len(Y_Test), c='r')
    plt.plot(range(0, len(Y_Test)), [TSDL2] * len(Y_Test), c='r')
    plt.xlim([0, len(Y_Test)])
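
Run as a plain script (rather than in an interactive matplotlib session), the figures built above never render; a minimal follow-up after calling Trial(), assuming pyplot is imported as plt as the function body implies:

plt.show()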