def web_logo_creator(sequence_list, sequence_name, output):
    """
    Draw a sequence logo (information content, in bits) with ggseqlogo.

    The figure is written by R to ``<output><sequence_name>_weblogo.png``;
    ``output`` and ``sequence_name`` are simply concatenated.

    :param sequence_list: (tuple of strings) - list of sequences; the length \
    of the first one is used as the number of positions on the x axis
    :param sequence_name: (string) name of the sequence, also used as the \
    plot title
    :param output: (string) the folder where the results will be created
    """
    warnings.filterwarnings("ignore", category=RRuntimeWarning)

    # R closure doing the actual plotting; it is evaluated once per call.
    r_source = """
    library("ggplot2")
    library("ggseqlogo")

    function(mys_seq, name_file, mytitle, size){
        s1 = 15
        cs1 = make_col_scheme(chars=c('A','T','G','C', 'R', 'Y', 'W', 'S', 'K', 'M'), groups=c('g1','g2','g3','g4','g5', 'g6', 'g7', 'g8', 'g9', 'g10'),cols=c('limegreen','brown1','gold','dodgerblue3','darkorange', "brown1", "limegreen", "dodgerblue3", "darkorchid3", "dodgerblue3"), name='custom1')

        p1 = ggseqlogo(mys_seq, method = "bit", col_scheme=cs1, namespace = c('A','T','G','C', 'R', 'Y', 'W', 'S', 'K', 'M')) + theme_logo() + scale_x_discrete(limits = as.character(seq(1,size, by=1)), labels = as.character(seq(1,size, by=2)), breaks = as.character(seq(1, size, by=2))) + theme(axis.title.y=element_text(size=s1+25), legend.position="none")
        p1 = p1 + ggtitle(mytitle) + theme(plot.title = element_text(hjust = 0.5))
        p1 = p1 + theme(axis.text=element_text(size=s1 + 25), plot.title = element_text(size=s1 + 30))
        p1 = p1 + scale_y_discrete(limits = c(0, 0.5, 1), labels = as.character(seq(0,1, length=3)), breaks = as.character(seq(0,1, length=3)), expand = c(0,0.05))
        #p1 = p1 + ylim(0,1)
        png(file=paste(name_file,"_weblogo.png", sep=""),height=149 * 2,width=52 * size * 2 )
        print(p1)
        dev.off()
    }
    """
    weblogo_maker = robj.r(r_source)

    # Wrap every argument in the R vector type the closure expects.
    r_sequences = v.StrVector(sequence_list)
    r_file_stem = v.StrVector([output + sequence_name])
    r_title = v.StrVector([sequence_name])
    r_size = v.IntVector([len(sequence_list[0])])

    weblogo_maker(r_sequences, r_file_stem, r_title, r_size)
def _dscquery(dsc_output, targets, conditions=None, verbose=True):
    """Call ``dscquery`` from the R package ``dscrutils`` and convert the result.

    ``targets`` and ``conditions`` are sent to R as character vectors, or as
    R ``NULL`` when they are ``None``.  The R return value is converted with
    rpy2's default converter combined with the pandas converter.
    """
    dscrutils = importr('dscrutils')

    if targets is None:
        r_targets = robj.NULL
    else:
        r_targets = rvec.StrVector(targets)

    if conditions is None:
        r_conditions = robj.NULL
    else:
        r_conditions = rvec.StrVector(conditions)

    r_result = dscrutils.dscquery(
        dsc_output,
        r_targets,
        conditions=r_conditions,
        verbose=verbose,
    )

    with localconverter(robj.default_converter + pandas2ri.converter):
        return robj.conversion.rpy2py(r_result)
def web_logo_creator(sequences, name_file, output):
    """
    Draw a sequence logo (per-position probabilities) with ggseqlogo.

    The figure is written by R to ``output + name_file`` (plain string
    concatenation) and carries an empty title.

    :param sequences: sequences to plot; handed to R as a character vector
    :param name_file: (string) name of the image file to create
    :param output: (string) the folder where the results will be created
    """
    warnings.filterwarnings("ignore", category=RRuntimeWarning)

    # R closure doing the actual plotting; it is evaluated once per call.
    r_source = """
    library("ggplot2")
    library("ggseqlogo")

    function(mys_seq, name_file, mytitle){
        s1 = 15
        cs1 = make_col_scheme(chars=c('A','T','G','C', 'R', 'Y', 'W', 'S', 'K', 'M'), groups=c('g1','g2','g3','g4','g5', 'g6', 'g7', 'g8', 'g9', 'g10'),cols=c('limegreen','brown1','gold','dodgerblue3','darkorange', "brown1", "limegreen", "dodgerblue3", "darkorchid3", "dodgerblue3"), name='custom1')

        p1 = ggseqlogo(mys_seq, method = "probability", col_scheme=cs1, namespace = c('A','T','G','C', 'R', 'Y', 'W', 'S', 'K', 'M')) + theme_logo() + theme(axis.title.y=element_text(size=s1+25), legend.position="none")
        p1 = p1 + ggtitle(mytitle) + theme(plot.title = element_text(hjust = 0.5))
        p1 = p1 + theme(axis.text=element_text(size=s1 + 50), plot.title = element_text(size=s1 + 60))
        #p1 = p1 + ylim(0,1)
        png(file=name_file,height=300 * 2,width=400 * 2 )
        print(p1)
        dev.off()
    }
    """
    weblogo_maker = robj.r(r_source)

    target_path = output + name_file
    r_sequences = v.StrVector(sequences)
    weblogo_maker(r_sequences, target_path, "")
def test_spread(self):
    """Spreading on 'labels' gives exactly one column per label."""
    labels = ('a', 'b', 'c', 'd', 'e')
    x_values = vectors.IntVector((1, 2, 3, 4, 5))
    label_values = vectors.StrVector(labels)
    dataf = tidyr.DataFrame({'x': x_values, 'labels': label_values})

    dataf_spread = dataf.spread('labels', 'x')

    # Column order is not checked, only the set of names.
    expected = sorted(labels)
    observed = sorted(dataf_spread.colnames)
    assert expected == observed
def test_dataframe(self):
    """A tidyr.DataFrame built from a dict keeps its type and column names."""
    x_values = vectors.IntVector((1, 2, 3, 4, 5))
    label_values = vectors.StrVector(('a', 'b', 'b', 'b', 'a'))
    dataf = tidyr.DataFrame({'x': x_values, 'labels': label_values})

    assert isinstance(dataf, tidyr.DataFrame)

    # Column order is not checked, only the set of names.
    expected = sorted(['x', 'labels'])
    observed = sorted(dataf.colnames)
    assert expected == observed
def make_count_set(conds, r):
    """
    Create a DESeq count data set named ``cds`` in the R session ``r``.

    ``conds`` is assigned in R as a factor under the name ``conds``.  The R
    snippet also reads a variable called ``counts`` that is not created
    here (presumably loaded into the session by the caller; TODO confirm).

    Returns the same session object ``r``.
    """
    condition_factor = vectors.StrVector.factor(vectors.StrVector(conds))
    r.assign('conds', condition_factor)

    build_cds = '''
        require('DESeq')
        cds = newCountDataSet(counts, conds)
        '''
    r(build_cds)
    return r
def make_count_set(conds, r):
    """
    Create a DSS sequence count set named ``cds`` in the R session ``r``.

    ``conds`` is assigned in R as a plain character vector under the name
    ``conds``.  The R snippet also reads a variable called ``count_matrix``
    that is not created here (presumably loaded into the session by the
    caller; TODO confirm).

    Returns the same session object ``r``.
    """
    # Factor conversion was used previously and is kept for reference:
    # r.assign('conds', vectors.StrVector.factor(vectors.StrVector(conds)))
    condition_vector = vectors.StrVector(conds)
    r.assign('conds', condition_vector)

    build_cds = '''
        require('DSS')
        cds = newSeqCountSet(count_matrix, conds)
        '''
    r(build_cds)
    return r
def pval_getter(val, cell, reg):
    """
    Compare two logistic regressions in R and return the resulting p-value.

    The R code fits ``val ~ cell`` and ``val ~ reg + cell`` (binomial, logit
    link), prints both summaries and the ``anova(..., test="Chisq")`` table,
    and returns the second entry of its ``Pr(>Chi)`` column.

    :param val: values sent to R as an integer vector
    :param cell: values sent to R as a character vector
    :param reg: values sent to R as a character vector
    :return: the object rpy2 returns for the R numeric p-value
    """
    r_source = """
    function(val, cell, reg){
        data <- as.data.frame(cbind(val, cell, reg))
        data$val <- as.factor(data$val)
        data$reg <- as.factor(data$reg)
        data$cell <- as.factor(data$cell)
        md0 <-glm(val ~ cell, family=binomial("logit"),data=data)
        md1 <-glm(val ~ reg+cell, family=binomial("logit"),data=data)
        print(summary(md0))
        print(summary(md1))
        a <- anova(md1, md0, test="Chisq")
        print(a)
        return(as.numeric(a$"Pr(>Chi)"[2]))
    }
    """
    compare_models = rpy2.robjects.r(r_source)

    r_val = v.IntVector(val)
    r_cell = v.StrVector(cell)
    r_reg = v.StrVector(reg)
    return compare_models(r_val, r_cell, r_reg)
import pytest
from rpy2.robjects import vectors
from rpy2.robjects.packages import importr
from rpy2.ipython import html

base = importr('base')


@pytest.mark.parametrize(
    'o,func',
    [(vectors.IntVector([1, 2, 3]), html.html_vector_horizontal),
     (vectors.FloatVector([1, 2, 3]), html.html_vector_horizontal),
     # A missing comma used to merge 'b' and 'c' into the single string
     # 'bc' (implicit literal concatenation) in the next two cases.
     (vectors.StrVector(['a', 'b', 'c']), html.html_vector_horizontal),
     (vectors.FactorVector(['a', 'b', 'c']), html.html_vector_horizontal),
     (vectors.ListVector({'a': 1, 'b': 2}), html.html_rlist),
     (vectors.DataFrame({'a': 1, 'b': 'z'}), html.html_rdataframe),
     ('x <- c(1, 2, 3)', html.html_sourcecode),
     (base.c, html.html_ridentifiedobject)])
def test_html_func(o, func):
    """Each HTML helper must render its input object to a ``str``."""
    res = func(o)
    assert isinstance(res, str)
__email__ = '*****@*****.**' __status__ = 'Dev' # Basic R utils r_base = rpack.importr("base") r_utils = rpack.importr("utils") r_stats = rpack.importr("stats") # Make sure that multispatialCCM is installed r_utils.chooseCRANmirror(ind=1) # select the first mirror in the list r_packnames = ('multispatialCCM', 'Rmisc') # Essential packages to_install = [x for x in r_packnames if not rpack.isinstalled(x)] if len(to_install) > 0: print("Installing Packages: {}".format(to_install)) r_utils.install_packages(rvec.StrVector(to_install)) mccm = rpack.importr("multispatialCCM") def make_test_data(rx_a=3.72, rx_b=3.72, b_ab=0.2, b_ba=0.01, t=20, obs=10, seed=12345): """ Creates a test data-set based on a coupled logistic map. This function generates a data-set suitable for testing pyMCCM. The data is generated from a coupled logistic map, which is known to be an accurate model for population dynamics between one predatory and one prey species that coexist within an ecosystem. The coupled logistic map is characterized by a shape parameter 'r' and a coupling strength 'b'. Args:
def create_imput_loyer(year):
    '''
    Impute rents (``loym``) onto the ERF household table for ``year``.

    Steps, as performed below:

    1. Load the temporary "menagem" and "indivim" tables, build an ``erf``
       table of households whose ``so`` is 3, 4 or 5, and recode its
       variables.
    2. Build a ``Logement`` table from the "lgt_menage", "lgt_logt" (2006
       survey only) and "adresse" tables and apply recodes that produce the
       same variable names as on the ``erf`` side.
    3. Call the R package StatMatch (``NND_hotdeck`` then ``create_fused``)
       to attach a donor ``lmlm`` to every ``erf`` row.
    4. Rename ``lmlm`` to ``loym``, left-merge it on ``ident`` into the
       "menagem" table and save that table back.

    :param year: survey year; 2003 and 2006-2009 are the only years for
        which a housing survey vintage is selected
    :raises ValueError: if ``year`` is outside those supported years
    '''

    # Variables used for imputation
    df = DataCollection(year=year)
    print('Démarrer 02_imput_loyer')

    menm_vars = [
        "ztsam", "zperm", "zragm", "zricm", "zrncm", "zracm", "nb_uci",
        "wprm", "so", "nbpiec", "typmen5", "spr", "nbenfc", "agpr",
        "cstotpr", "nat28pr", "tu99", "aai1", 'ident', "pol99", "reg",
        "tau99"
    ]

    if year == 2008:  # tau99 not present
        # list.pop() expects an index (and would have rebound menm_vars to
        # the removed string); remove() drops the entry by value.
        menm_vars.remove('tau99')

    indm_vars = ["noi", 'ident', "lpr", "dip11"]

    LgtAdrVars = ["gzc2"]
    LgtMenVars = [
        "sec1", "mrcho", "mrret", "mrsal", "mrtns", "mdiplo", "mtybd",
        "magtr", "mcs8", "maa1at", "qex", "muc1"
    ]

    if year == 2003:
        LgtMenVars.extend(["typse", "lmlm", "hnph2", "mnatior", "ident"])
        LgtAdrVars.extend(["iaat", "tu99", "ident"])
    if year < 2010 and year > 2005:
        LgtMenVars.extend(["mnatio", "idlog"])
        LgtAdrVars.extend(["idlog"])  # no typse in 2006
        LgtLgtVars = ["lmlm", "iaat", "tu99", "hnph2", "idlog"]  # no typse in 2006

    ## Work on the ERF data
    # Preparing ERF menages tables
    #    print show_temp()
    # TODO : data.get_values
    erfmenm = load_temp(name="menagem", year=year)
    #    erfmenm = df.get_values(table="erf_menage",variables=menm_vars)
    erfmenm['revtot'] = (erfmenm['ztsam'] + erfmenm['zperm'] +
                         erfmenm['zragm'] + erfmenm['zricm'] +
                         erfmenm['zrncm'] + erfmenm['zracm'])
    erfmenm['nvpr'] = erfmenm['revtot'].astype(
        np.float64) / erfmenm['nb_uci'].astype(np.float64)
    # Negative nvpr values are set to 0
    tmp = np.zeros(erfmenm['nvpr'].shape, dtype=int)
    erfmenm['nvpr'] = max_(tmp, erfmenm['nvpr'])
    for v in erfmenm['nvpr']:  # Check that no negative nvpr remains
        assert v >= 0, Exception('Some nvpr are negatives')
    erfmenm['logt'] = erfmenm['so']
    l = erfmenm.columns.tolist()
    #    print l

    # Preparing ERF individuals table
    erfindm = load_temp(name="indivim", year=year)
    #    erfindm = df.get_values(table = "eec_indivi", variables = indm_vars)

    # TODO: clean this later
    erfindm['dip11'] = 0
    count_NA('dip11', erfindm)
    #    erfindm['dip11'] = 99
    erfindm = erfindm[['ident', 'dip11']][erfindm['lpr'] == 1]

    # erf <- merge(erfmenm, erfindm, by ="ident")
    print('merging erf menage and individu')
    erf = erfmenm.merge(erfindm, on='ident', how='inner')
    erf = erf.drop_duplicates('ident')

    # control(erf): the column exists but is empty,
    # it may have been confused with dip11?

    dec, values = mark_weighted_percentiles(erf['nvpr'],
                                            arange(1, 11),
                                            erf['wprm'],
                                            2,
                                            return_quantiles=True)
    values.sort()
    # Decile index: 1 plus the number of thresholds values[1..9] exceeded.
    erf['deci'] = (1 + (erf['nvpr'] > values[1]) +
                   (erf['nvpr'] > values[2]) + (erf['nvpr'] > values[3]) +
                   (erf['nvpr'] > values[4]) + (erf['nvpr'] > values[5]) +
                   (erf['nvpr'] > values[6]) + (erf['nvpr'] > values[7]) +
                   (erf['nvpr'] > values[8]) + (erf['nvpr'] > values[9]))
    # Problem: every individual ends up in either the first or the last
    # decile.
    assert_variable_inrange('deci', [1, 11], erf)
    count_NA('deci', erf)
    del dec, values
    gc.collect()

    # TODO: tie this to menm_vars; "pol99", "reg", "tau99" are missing and
    # logt, 'nvpr', 'revtot', 'dip11', 'deci' are extra here
    erf = erf[[
        'ident', 'ztsam', 'zperm', 'zragm', 'zricm', 'zrncm', 'zracm',
        'nb_uci', 'logt', 'nbpiec', 'typmen5', 'spr', 'nbenfc', 'agpr',
        'cstotpr', 'nat28pr', 'tu99', 'aai1', 'wprm', 'nvpr', 'revtot',
        'dip11', 'deci'
    ]][erf['so'].isin(range(3, 6))]

    erf.rename(columns={
        'nbpiec': 'hnph2',
        'nat28pr': 'mnatio',
        'aai1': 'iaat',
        'dip11': 'mdiplo'
    },
               inplace=True)

    # TODO: types are not handled as in R; test them to see how pandas
    # deals with them

    count_NA('agpr', erf)
    erf['agpr'] = erf['agpr'].astype('int64')
    # TODO: ugly, why create two variables when one is enough?
    erf['tmp'] = 3
    erf['tmp'][erf['agpr'] < 65] = 2
    erf['tmp'][erf['agpr'] < 40] = 1
    erf['magtr'] = erf['tmp']
    count_NA('magtr', erf)
    assert_variable_inrange('magtr', [1, 4], erf)

    count_NA('cstotpr', erf)
    erf['tmp'] = erf['cstotpr'].astype('float') / 10.0
    erf['tmp'] = map(math.floor, erf['tmp'])
    erf['mcs8'] = erf['tmp']
    erf['mcs8'][erf['mcs8'] == 0] = NaN
    # assert isinstance(erf['mcs8'], (int, long)).all(), Exception('Some mcs8 are not integers')
    count_NA('mcs8', erf)

    # TODO: 41 NAs remain for 2003
    erf['mtybd'] = NaN
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] != 2)] = 1
    erf['mtybd'][(erf['typmen5'] == 1) & (erf['spr'] == 2)] = 2
    erf['mtybd'][erf['typmen5'] == 5] = 3
    erf['mtybd'][erf['typmen5'] == 3] = 7
    erf['mtybd'][erf['nbenfc'] == 1] = 4
    erf['mtybd'][erf['nbenfc'] == 2] = 5
    erf['mtybd'][erf['nbenfc'] >= 3] = 6
    count_NA('mtybd', erf)
    #    print erf['mtybd'].dtype.fields
    #assert_variable_inrange('mtybd', [1,7], erf) # bug: a 7.0 value trips the assert

    # TODO: 3 dwellings have 0 rooms!
    erf['hnph2'][erf['hnph2'] < 1] = 1
    erf['hnph2'][erf['hnph2'] >= 6] = 6
    count_NA('hnph2', erf)
    assert_variable_inrange('hnph2', [1, 7], erf)

    # # TODO: one NA remains for 2003
    # #       one NA remains for 2008
    tmp = erf['mnatio']
    tmp[erf['mnatio'] == 10] = 1
    tmp[erf['mnatio'].isin([
        11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 41,
        42, 43, 44, 45, 46, 47, 48, 51, 52, 62, 60
    ])] = 2
    erf['mnatio'] = tmp
    count_NA('mnatio', erf)
    assert_variable_inrange('mnatio', [1, 3], erf)

    # NOTE(review): the masks below test 'mnatio' although the values
    # being recoded are 'iaat'; this looks like a copy-paste slip (the
    # Logement-side recode tests 'iaat'). Left unchanged; confirm against
    # the R source before touching it.
    tmp = erf['iaat']
    tmp[erf['mnatio'].isin([1, 2, 3])] = 1
    tmp[erf['mnatio'] == 4] = 2
    tmp[erf['mnatio'] == 5] = 3
    tmp[erf['mnatio'] == 6] = 4
    tmp[erf['mnatio'] == 7] = 5
    tmp[erf['mnatio'] == 8] = 6
    erf['iaat'] = tmp
    count_NA('iaat', erf)
    assert_variable_inrange('iaat', [1, 7], erf)

    # # One NA remains for 2003
    # # One NA remains for 2008
    # table(erf$iaat, useNA="ifany")
    # TODO: compare logement and erf to make sure they match
    tmp = erf['mdiplo']
    tmp[erf['mdiplo'].isin([71, ""])] = 1
    tmp[erf['mdiplo'].isin([70, 60, 50])] = 2
    tmp[erf['mdiplo'].isin([41, 42, 31, 33])] = 3
    tmp[erf['mdiplo'].isin([10, 11, 30])] = 4
    erf['mdiplo'] = tmp
    count_NA('mdiplo', erf)
    #assert_variable_inrange('mdiplo', [1,5], erf) # a stray 99 value is present

    tmp = erf['tu99']
    tmp[erf['tu99'] == 0] = 1
    tmp[erf['tu99'].isin([1, 2, 3])] = 2
    tmp[erf['tu99'].isin([4, 5, 6])] = 3
    tmp[erf['tu99'] == 7] = 4
    tmp[erf['tu99'] == 8] = 5
    erf['tu99_recoded'] = tmp
    count_NA('tu99_recoded', erf)
    assert_variable_inrange('tu99_recoded', [1, 6], erf)

    # TODO: 0? Add 2003!
    tmp = erf['mcs8']
    tmp[erf['mcs8'] == 1] = 1
    tmp[erf['mcs8'] == 2] = 2
    tmp[erf['mcs8'] == 3] = 3
    tmp[erf['mcs8'].isin([4, 8])] = 4
    tmp[erf['mcs8'].isin([5, 6, 7])] = 5
    erf['mcs8'] = tmp
    count_NA('mcs8', erf)
    assert_variable_inrange('mcs8', [1, 6], erf)

    erf['wprm'] = erf['wprm'].astype('int64')
    count_NA('wprm', erf)

    del (erf['cstotpr'], erf['agpr'], erf['typmen5'], erf['nbenfc'],
         erf['spr'], erf['tmp'], erf['tu99'])
    gc.collect()

    erf = erf.dropna(subset=[
        'logt', 'magtr', 'mcs8', 'mtybd', 'hnph2', 'mnatio', 'iaat',
        'mdiplo', 'tu99_recoded'
    ])
    # Finally check that there are no duplicated individuals
    assert erf['ident'].value_counts().max() == 1, Exception(
        'Number of distinct individuals after removing duplicates is not correct'
    )

    ## Work on the housing (logement) table
    # Household table
    if year == 2003:
        year_lgt = 2003
    elif year > 2005 and year < 2010:
        year_lgt = 2006
    else:
        # year_lgt used to stay unbound here and the function failed later
        # with an UnboundLocalError; fail with an explicit message instead.
        raise ValueError(
            'No housing survey vintage is defined for year {}'.format(year))

    print("preparing logement menage table")
    # Lgtmen = load_temp(name = "indivim",year = year) # dummy step added
    Lgtmen = df.get_values(table="lgt_menage", variables=LgtMenVars)
    Lgtmen.rename(columns={'idlog': 'ident'}, inplace=True)

    count_NA('mrcho', Lgtmen)
    Lgtmen['mrcho'].fillna(0, inplace=True)
    Lgtmen['mrret'].fillna(0, inplace=True)
    Lgtmen['mrsal'].fillna(0, inplace=True)
    Lgtmen['mrtns'].fillna(0, inplace=True)
    count_NA('mrcho', Lgtmen)
    Lgtmen['revtot'] = Lgtmen['mrcho'] + Lgtmen['mrret'] + Lgtmen[
        'mrsal'] + Lgtmen['mrtns']  # Drop negative incomes?
    count_NA('revtot', Lgtmen)
    Lgtmen['nvpr'] = 10.0 * Lgtmen['revtot'] / Lgtmen['muc1']

    count_NA('qex', Lgtmen)
    dec, values = mark_weighted_percentiles(Lgtmen['nvpr'],
                                            arange(1, 11),
                                            Lgtmen['qex'],
                                            2,
                                            return_quantiles=True)
    values.sort()
    # Decile index: 1 plus the number of thresholds values[1..9] exceeded.
    Lgtmen['deci'] = (
        1 + (Lgtmen['nvpr'] > values[1]) + (Lgtmen['nvpr'] > values[2]) +
        (Lgtmen['nvpr'] > values[3]) + (Lgtmen['nvpr'] > values[4]) +
        (Lgtmen['nvpr'] > values[5]) + (Lgtmen['nvpr'] > values[6]) +
        (Lgtmen['nvpr'] > values[7]) + (Lgtmen['nvpr'] > values[8]) +
        (Lgtmen['nvpr'] > values[9]))
    del dec, values
    print(Lgtmen['deci'].describe())
    gc.collect()

    ## Housing table (not in 2003 but in 2006)
    # str(lgtmen)
    # if (year_lgt=="2006"){
    #   message("preparing logement logement table")
    #   lgtlgt <- LoadIn(lgtLgtFil,lgtLgtVars)
    #   lgtlgt <- upData(lgtlgt, rename=renameidlgt)
    #   lgtmen <- merge(lgtmen, lgtlgt, by.x="ident", by.y="ident")
    if year_lgt == 2006:
        print('preparing logement logement table')
        lgtlgt = df.get_values(table="lgt_logt", variables=LgtLgtVars)
        lgtlgt.rename(columns={'idlog': 'ident'}, inplace=True)
        Lgtmen = Lgtmen.merge(lgtlgt,
                              left_on='ident',
                              right_on='ident',
                              how='inner')
        del lgtlgt

    data = Lgtmen[Lgtmen['sec1'].isin([21, 22, 23, 24, 30])]
    del Lgtmen
    gc.collect()

    if year_lgt == 2006:
        data.rename(columns={'mnatio': 'mnatior'}, inplace=True)

    data = (data[data['mnatior'].notnull()])
    data = (data[data['sec1'].notnull()])
    data['tmp'] = data['sec1'].astype(np.int64)
    data['tmp'][data['sec1'].isin([21, 22, 23])] = 3
    data['tmp'][data['sec1'] == 24] = 4
    data['tmp'][data['sec1'] == 30] = 5
    data['logt'] = data['tmp']
    count_NA('logt', data)
    data = (data[data['logt'].notnull()])
    Lgtmen = data

    # ## Address table
    print("preparing logement adresse table")
    # lgtadr <- LoadIn(lgtAdrFil,lgtAdrVars)
    # lgtadr <- upData(lgtadr, rename=renameidlgt)

    # dummy step added
    Lgtadr = df.get_values(table="adresse", variables=LgtAdrVars)
    Lgtadr.rename(columns={'idlog': 'ident'}, inplace=True)

    print('Merging logement and menage tables')
    Logement = Lgtmen.merge(Lgtadr, on='ident', how='inner')
    # control(Logement) # No idfoy, etc. in the logement table?

    Logement['hnph2'][Logement['hnph2'] >= 6] = 6
    Logement['hnph2'][Logement['hnph2'] < 1] = 1
    count_NA('hnph2', Logement)
    assert not Logement['hnph2'].isnull().any(), "Some hnph2 are null"
    # Logement=(Logement[Logement['hnph2'].notnull()]) # Commented out because hnph2 has 0 NA

    # Same `within` step here and below (see the R code)
    # TODO: problem here, 07 is turned into 7
    # because Python treats 0n literals as octal numbers ( < 08 ).
    # Hopefully this does not matter.
    Logement['tmp'] = Logement['mnatior']
    Logement['tmp'][Logement['mnatior'].isin([0, 1])] = 1
    Logement['tmp'][Logement['mnatior'].isin([2, 3, 4, 5, 6, 7, 8, 9, 10,
                                              11])] = 2
    Logement['mnatior'] = Logement['tmp']
    count_NA('mnatior', Logement)
    assert_variable_inrange('mnatior', [1, 3], Logement)

    Logement['tmp'] = Logement['iaat']
    Logement['tmp'][Logement['iaat'].isin([1, 2, 3, 4, 5])] = 1
    Logement['tmp'][Logement['iaat'] == 6] = 2
    Logement['tmp'][Logement['iaat'] == 7] = 3
    Logement['tmp'][Logement['iaat'] == 8] = 4
    Logement['tmp'][Logement['iaat'] == 9] = 5
    Logement['tmp'][Logement['iaat'] == 10] = 6
    # TODO question for Clément: what about 9 and 10?
    Logement['iaat'] = Logement['tmp']
    count_NA('iaat', Logement)
    assert_variable_inrange('iaat', [1, 7], Logement)

    Logement['tmp'] = Logement['mdiplo']
    Logement['tmp'][Logement['mdiplo'] == 1] = 1
    Logement['tmp'][Logement['mdiplo'].isin([2, 3, 4])] = 2
    Logement['tmp'][Logement['mdiplo'].isin([5, 6, 7, 8])] = 3
    Logement['tmp'][Logement['mdiplo'] == 9] = 4
    Logement['mdiplo'] = Logement['tmp']
    count_NA('mdiplo', Logement)
    assert_variable_inrange('mdiplo', [1, 5], Logement)

    Logement['tmp'] = Logement['mtybd']
    Logement['tmp'][Logement['mtybd'] == 110] = 1
    Logement['tmp'][Logement['mtybd'] == 120] = 2
    Logement['tmp'][Logement['mtybd'] == 200] = 3
    Logement['tmp'][Logement['mtybd'].isin([311, 321, 401])] = 4
    Logement['tmp'][Logement['mtybd'].isin([312, 322, 402])] = 5
    Logement['tmp'][Logement['mtybd'].isin([313, 323, 403])] = 6
    Logement['tmp'][Logement['mtybd'] == 400] = 7
    Logement['mtybd'] = Logement['tmp']
    count_NA('mtybd', Logement)
    assert_variable_inrange('mtybd', [1, 8], Logement)

    Logement['tmp'] = Logement['tu99']
    count_NA('tu99', Logement)
    Logement['tmp'][Logement['tu99'] == 0] = 1
    Logement['tmp'][Logement['tu99'].isin([1, 2, 3])] = 2
    Logement['tmp'][Logement['tu99'].isin([4, 5, 6])] = 3
    Logement['tmp'][Logement['tu99'] == 7] = 4
    Logement['tmp'][Logement['tu99'] == 8] = 5
    Logement['tu99_recoded'] = Logement['tmp']
    count_NA('tu99_recoded', Logement)
    assert_variable_inrange('tu99_recoded', [1, 6], Logement)

    Logement['tmp'] = Logement['gzc2']
    Logement['tmp'][Logement['gzc2'] == 1] = 1
    Logement['tmp'][Logement['gzc2'].isin([2, 3, 4, 5, 6])] = 2
    Logement['tmp'][Logement['gzc2'] == 7] = 3
    Logement['gzc2'] = Logement['tmp']
    count_NA('gzc2', Logement)
    assert_variable_inrange('gzc2', [1, 4], Logement)

    Logement['tmp'] = Logement['magtr']
    Logement['tmp'][Logement['magtr'].isin([1, 2])] = 1
    Logement['tmp'][Logement['magtr'].isin([3, 4])] = 2
    Logement['tmp'][Logement['magtr'] == 5] = 3
    Logement['magtr'] = Logement['tmp']
    count_NA('magtr', Logement)
    assert_variable_inrange('magtr', [1, 4], Logement)

    Logement['tmp'] = Logement['mcs8']
    Logement['tmp'][Logement['mcs8'] == 1] = 1
    Logement['tmp'][Logement['mcs8'] == 2] = 2
    Logement['tmp'][Logement['mcs8'] == 3] = 3
    Logement['tmp'][Logement['mcs8'].isin([4, 8])] = 4
    Logement['tmp'][Logement['mcs8'].isin([5, 6, 7])] = 5
    Logement['mcs8'] = Logement['tmp']
    count_NA('mcs8', Logement)
    assert_variable_inrange('mcs8', [1, 6], Logement)

    Logement['logloy'] = Logement['lmlm'].apply(lambda x: math.log(x))

    Logement = (Logement[Logement['mdiplo'].notnull()])
    Logement = (Logement[Logement['mtybd'].notnull()])
    Logement = (Logement[Logement['magtr'].notnull()])
    Logement = (Logement[Logement['mcs8'].notnull()])
    Logement = (Logement[Logement['maa1at'].notnull()])

    ## Rent imputation proper
    # library(StatMatch) # loads StatMatch
    # # library(mice) use md.pattern to locate missing data
    # TODO: remove?
    # logt <- subset(logement,select=c(lmlm,logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, ident))
    # logt$wprm <- logement$qex
    # erf <- subset(erf,select=c( logt , hnph2 , iaat , mdiplo , mtybd , tu99_recoded , magtr , mcs8 , deci, wprm, ident))
    print('Compute imputed rents')
    Logt = Logement[[
        'lmlm', 'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded',
        'magtr', 'mcs8', 'deci', 'ident'
    ]]
    Logt['wprm'] = Logement['qex']
    erf = erf[[
        'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr',
        'mcs8', 'deci', 'wprm', 'ident'
    ]]

    # # debug
    # # derf  <- describe(erf, weights=as.numeric(erf$wprm))
    # # dlogt <- describe(logt, weights=logt$wprm)
    # #
    # # for (var in as.list(names(derf))){
    # #   print("erf")
    # #   print(derf[[var]])
    # #   print("logt")
    # #   print(dlogt[[var]])
    # #   print("================")
    # # }
    # TODO add md.pattern
    # erf1 <- na.omit(erf)
    # logt <- na.omit(logt)
    from pandas import DataFrame
    # As far as the R code is understood: drop the rows containing NAs
    erf = erf.dropna(how='any')
    #erf1 = erf # Is the rename still needed then?
    Logt = Logt.dropna(how='any')

    # allvars <- c("logt", "hnph2", "iaat", "mdiplo", "mtybd", "tu99_recoded", "magtr", "mcs8", "deci")
    # classes <- c("magtr","tu99_recoded")
    # matchvars <- setdiff(allvars,classes)
    allvars = [
        'logt', 'hnph2', 'iaat', 'mdiplo', 'mtybd', 'tu99_recoded', 'magtr',
        'mcs8', 'deci'
    ]
    classes = ['magtr', 'tu99_recoded']
    matchvars = list(set(allvars) - set(classes))
    erf['mcs8'] = erf['mcs8'].astype(int)

    # out.nnd <- NND.hotdeck(data.rec=erf1,data.don=logt,match.vars=matchvars,don.class=classes,gdist.fun="Gower")
    # fill.erf.nnd <- create.fused(data.rec=erf1, data.don=logt,mtc.ids=out.nnd$mtc.ids, z.vars="lmlm")
    from rpy2.robjects.packages import importr
    import rpy2.robjects.pandas2ri
    import rpy2.robjects.vectors as vectors
    rpy2.robjects.pandas2ri.activate()  # Lets rpy2 convert the dataframes
    sm = importr(
        "StatMatch")  #, lib_loc = "C:\Program Files\R\R-2.15.2\library")
    print('TEST 2')
    out_nnd = sm.NND_hotdeck(data_rec=erf,
                             data_don=Logt,
                             match_vars=vectors.StrVector(matchvars),
                             don_class=vectors.StrVector(classes),
                             dist_fun="Gower")
    print('TEST 3')
    fill_erf_nnd = sm.create_fused(data_rec=erf,
                                   data_don=Logt,
                                   mtc_ids=out_nnd[0],
                                   z_vars=vectors.StrVector(["lmlm"]))
    del allvars, matchvars, classes, out_nnd
    gc.collect()

    # fill.erf.nnd <- upData(fill.erf.nnd, rename=c(lmlm='loym'))
    import pandas.rpy.common as com
    fill_erf_nnd = com.convert_robj(fill_erf_nnd)
    fill_erf_nnd = DataFrame(fill_erf_nnd)
    (fill_erf_nnd).rename(columns={'lmlm': 'loym'}, inplace=True)

    # loy_imput = fill.erf.nnd[c('ident','loym')]
    loy_imput = (fill_erf_nnd)[['ident', 'loym']]

    # load(menm)
    # menagem$loym <- NULL
    # menagem <- merge(menagem,loy_imput,by='ident',all.x = TRUE)
    # save(menagem,file=menm)
    # Left as a comment block; presumably to be handled with care
    erfmenm = load_temp(name="menagem", year=year)
    #     del erfmenm['loym']
    erfmenm = erfmenm.merge(loy_imput, on='ident', how='left')
    assert 'loym' in erfmenm.columns, 'No loym in erfmenm columns'
    save_temp(erfmenm, name="menagem", year=year)