def generateHeadings(OffenceArray=c2n.offcat):
    """Build one heading for every unordered pair of entries in OffenceArray.

    Each entry is combined with every entry that comes after it. A heading
    is the two entries, each passed through ``c2n.upcaseFirstLetter``,
    concatenated with no separator.

    Parameters
    ----------
    OffenceArray : sequence
        Entries to pair up; must support ``len()`` and integer indexing.
        Defaults to ``c2n.offcat``.

    Returns
    -------
    list of str
        The headings, ordered by the position of the first entry and then
        by the position of the second.
    """
    capitalise = c2n.upcaseFirstLetter
    entryCount = len(OffenceArray)
    return [
        capitalise(first) + capitalise(OffenceArray[later])
        for position, first in enumerate(OffenceArray)
        for later in range(position + 1, entryCount)
    ]
def deathornot(CatEmp):
    """Create the raw "death or not" punishment totals per offence.

    The columns of ``CatEmp`` are labelled, in order, with
    ``<Offence>Not`` or ``<Offence>Death`` and columns sharing a label are
    summed. For each offence in ``c2n.offcat`` the label pattern is seven
    entries long: the third is ``Death`` and the other six are ``Not``.

    Use generateDependentModelLaplace(deathornot) to generate the
    probability estimates.

    Parameters
    ----------
    CatEmp : pandas DataFrame
        Numeric frame with exactly ``7 * len(c2n.offcat)`` columns (63
        according to the original note), ordered offence by offence.
        NOTE(review): which punishment each of the seven positions stands
        for is not visible here -- confirm against the column generator.

    Returns
    -------
    pandas DataFrame
        Same rows as ``CatEmp``; one column per distinct label, in order
        of first appearance.
    """
    # Grouping labels are built statically as it is easier to understand.
    PunishmentOutcome = ['Not'] * 2 + ['Death'] + ['Not'] * 4
    Groupings = [
        c2n.upcaseFirstLetter(Offence) + Outcome
        for Offence in c2n.offcat
        for Outcome in PunishmentOutcome
    ]
    # DataFrame.groupby(axis=1) is deprecated in recent pandas; grouping the
    # rows of the transposed frame and transposing back gives the same
    # column-wise sums on both old and new versions.
    DeathOrNotEmp = CatEmp.T.groupby(Groupings, sort=False).sum().T
    return DeathOrNotEmp
def validatePartitioning():
    """Exercise the partition search on hand-built data and print the result.

    Two synthetic frames are constructed, each is passed to
    ``oboP.generateAICtable``, and for every row the column holding the
    smallest AIC score is mapped to the corresponding entry of the offence
    partition set. The outcome is printed; the function takes no arguments
    and returns nothing.

    Expected grouping for the death-or-not frame and for the first row
    pattern of the full frame (from the original notes): offences follow
    the pattern A, B, A, C, A, A, A, C, A, i.e.
    [[0, 2, 4, 5, 6, 8], [1], [3, 7]]:
        A = breakingPeace, deception, miscellaneous, royalOffences,
            sexual, violentTheft
        B = damage
        C = kill, theft

    Try also:
    x = [15,85,44,56,49,51,46,54,50,50,50,50,56,44,46,54,4,96]
    """
    # Death-or-not data: 18 values read as nine consecutive pairs. The
    # first-to-pair-total ratios are .2, .7, .2, .5, .2, .2, .2, .5, .2,
    # while the pair totals differ so that offences in the same partition
    # do not have identical counts.
    deathRow = [40, 160, 70, 30, 20, 80, 50, 50, 20, 80,
                20, 80, 10, 40, 100, 100, 20, 80]

    # Full data: three punishment profiles of seven values each.
    profileA = [10, 20, 30, 40, 50, 60, 70]
    profileB = [70, 60, 50, 40, 30, 20, 10]
    profileC = [20, 10, 40, 30, 60, 50, 70]

    # Nine offences per row. listMul presumably scales every element of a
    # profile by the given factor (TODO confirm), keeping the profile shape
    # while varying the totals.
    # Pattern A, B, A, C, A, A, A, C, A (as above):
    fullRowFirst = (profileA + listMul(profileB, 2) + listMul(profileA, 3)
                    + profileC + profileA + profileA
                    + listMul(profileA, 2) + profileC + profileA)
    # Pattern C, A, A, B, B, C, A, A, C:
    fullRowSecond = (profileC + profileA + listMul(profileA, 2)
                     + profileB + listMul(profileB, 3) + profileC
                     + profileA + listMul(profileA, 3) + profileC)

    deathFrame = pd.DataFrame([deathRow, deathRow],
                              columns=list(range(18)),
                              index=[0, 1])
    fullFrame = pd.DataFrame(
        [fullRowFirst, fullRowFirst,
         fullRowSecond, fullRowSecond, fullRowSecond],
        columns=list(range(63)),
        index=list(range(5)))

    offencePartitions = partition.Partition(
        [c2n.upcaseFirstLetter(name) for name in c2n.offcat])

    def reportMinimalPartitions(frame):
        # Score every partition, then show the best-scoring one per row.
        scores = oboP.generateAICtable(frame)
        best = scores.idxmin(axis=1).apply(
            lambda key: offencePartitions[int(key)])
        print('Found minimal partitions:')
        print(best)

    print('Testing Death Or Not partitioning')
    reportMinimalPartitions(deathFrame)

    print('Testing full partitioning')
    reportMinimalPartitions(fullFrame)
def generateRowRange(StartDate, EndDate, Gender='None', http='None', Sockets=2):
    """Generate one row of data for a given date range and defendant gender.

    The code is deliberately not split into separate URL-fetching helpers
    per request, so that the connection pool opened by urllib3 can be
    reused for every request made here.

    Parameters
    ----------
    StartDate, EndDate :
        Bounds of the period; both are converted with ``str()`` before use.
    Gender :
        Passed unchanged to the URL generators. Defaults to the string
        ``'None'``.
    http :
        Connection pool handed to ``URLtoJSON``. When left at the default
        ``'None'`` (or given as ``None``) a new
        ``urllib3.PoolManager(maxsize=Sockets)`` is created.
    Sockets : int
        ``maxsize`` of the pool created when ``http`` is not supplied.

    Returns
    -------
    list
        ``str(StartDate)`` followed by one block of totals per entry of
        ``c2n.offcat`` and then one block per entry of ``c2n.offsubcat``.
        The not-guilty totals are written into the positions given by
        ``c2n.generateCategories()``.
        NOTE(review): this assumes the row layout matches the column list
        returned by ``c2n.generateCategories()`` -- confirm.
    """
    import CategoryToNumberAssignment as c2n
    import urllib3

    # 'None' (a string) is the "no pool supplied" default. Compare by
    # value: identity comparison against a string literal only works by
    # accident of interning and raises a SyntaxWarning on Python >= 3.8.
    if http is None or http == 'None':
        http = urllib3.PoolManager(maxsize=Sockets)

    sStartDate = str(StartDate)
    sEndDate = str(EndDate)
    _Columns = c2n.generateCategories()

    Row = [sStartDate]

    # Breakdown of punishments by offence category for the period.
    for Category in c2n.offcat:
        _TempCategories = initialiseEmptyCatArray()
        _Json = URLtoJSON(
            generateURLCategoryRange(sStartDate, sEndDate, Category, Gender),
            http)
        # Place each punishment total at its position in c2n.puncat,
        # shifted by one place to leave room for the not-guilty count.
        for Totals in _Json['breakdown']:
            _TempCategories[c2n.puncat.index(Totals['term']) + 1] = Totals['total']
        Row.extend(_TempCategories)

    # Associate offence subcategories with punishment subcategories.
    for Category in c2n.offsubcat:
        _TempCategories = initialiseEmptySubCatArray()
        _Json = URLtoJSON(
            generateURLSubCategoryRange(sStartDate, sEndDate, Category, Gender),
            http)
        # Same placement rule as above, using c2n.punsubcat.
        for Totals in _Json['breakdown']:
            _TempCategories[c2n.punsubcat.index(Totals['term']) + 1] = Totals['total']
        Row.extend(_TempCategories)

    # Not-guilty totals by category and by subcategory for the period.
    _JsonNotGuiltyCat = URLtoJSON(
        generateURLCategoryNotGuiltyRange(sStartDate, sEndDate, Gender), http)
    _JsonNotGuiltySubCat = URLtoJSON(
        generateURLSubCategoryNotGuiltyRange(sStartDate, sEndDate, Gender), http)

    # Place values at the locations of their '<Term>NotGuilty' columns.
    for Totals in _JsonNotGuiltyCat['breakdown']:
        Row[_Columns.index(c2n.upcaseFirstLetter(Totals['term']) + 'NotGuilty')] = Totals['total']
    for Totals in _JsonNotGuiltySubCat['breakdown']:
        Row[_Columns.index(c2n.upcaseFirstLetter(Totals['term']) + 'NotGuilty')] = Totals['total']

    return Row
import pandas as pd import numpy as np from ast import literal_eval from partitionsets import partition import CategoryToNumberAssignment as c2n import OBOModelling as oboM #The following dependency is from CythonGSL from https://github.com/twiecki/CythonGSL # The interfaces must be installed as: # sudo python3 setup_interface.py install # in the CythonGSL directory. (you will need gcc and libgsl-dev or equivalent installed) import probability_distribution as gslPDD Deltas = [1,2,3,4,5,10,50,100,240] #Initilise partitions: partitions = partition.Partition([c2n.upcaseFirstLetter(x) for x in c2n.offcat]) partitioN = partition.Partition(list(range(0,9))) def partitionAIC(EmpFrame, part, OffenceEstimateFrame = [], ReturnDeathEstimate=False, BlockPunishment='Death', Verbose=True): """Calculate AIC score between the EmpFrame and the model where offences are partitioned as `part'. Parameters: ----------- EmpFrame : DataFrame DataFrame of emperical data, pre processed, maybe. part : nested list, 2 levels Partition formatted as: [[0, 3], [1, 2, 6, 7], [4, 5, 8]] ReturnDeathEstimate : bool Whether to return the DeathEstimate frame Returns: