예제 #1
0
def fast_ngram_counter(name_bdd,concept_list=''):
	"""Count n-grams over the 'billets' table with a pool of worker processes.

	The row count returned by ``fonctions_bdd.count_rows`` is cut into chunks
	of 5000 rows. One task per chunk is handed to ``fast_ngram_counter_x`` in
	a worker process, and the per-chunk dictionaries are folded together with
	``fonctions_lib.merge`` using addition as the combining function.

	Args:
		name_bdd: database identifier, forwarded to
			``fonctions_bdd.count_rows`` and to every worker task.
		concept_list: forwarded to every worker task. When left at its
			default ``''`` the merged dictionary is additionally passed
			through ``misc.freq_tri`` before being returned.

	Returns:
		The merged dictionary (as returned by ``misc.freq_tri`` when
		``concept_list == ''``).

	Note:
		``sample``, ``freqmin``, ``top``, ``language``, ``ng_filter`` and
		``fast_ngram_counter_x`` are not defined in this function; they are
		read from the enclosing module scope.
	"""
	import multiprocessing

	Nb_rows = fonctions_bdd.count_rows(name_bdd,'billets')
	size_seq = 5000
	# Floor division: stays an int under both Python 2 and true division,
	# which range() below requires.
	nb_sequences = Nb_rows // size_seq
	# One extra task beyond the full chunks (presumably for the trailing
	# partial chunk -- the worker decides what rows it actually reads).
	nb_tasks = nb_sequences + 1
	# Build the task list before spawning workers so that a failure here
	# cannot leave a pool behind.
	inputs = [
		(x,size_seq,Nb_rows,sample,nb_sequences,concept_list,name_bdd)
		for x in range(nb_tasks)
	]
	pool_size = min(nb_tasks,multiprocessing.cpu_count()*4)
	pool = multiprocessing.Pool(processes=pool_size)
	try:
		pool_outputs = pool.map(fast_ngram_counter_x, inputs)
	finally:
		# Always release the worker processes, even if a task raised.
		pool.close()
		pool.join()
	dictionnaire_gramme = {}
	for dictionnaire_gramme_x in pool_outputs:
		dictionnaire_gramme = fonctions_lib.merge(dictionnaire_gramme, dictionnaire_gramme_x, lambda x,y:x+y)
	if concept_list=='':
		# No concept list supplied: sort/filter the merged dictionary.
		dictionnaire_gramme = misc.freq_tri(dictionnaire_gramme,freqmin,int(math.floor(top*1.1)),language,ng_filter)
	return dictionnaire_gramme
예제 #2
0
	for y,x in enumerate(pool_outputs):		
		dictionnaire_gramme_year[y]=x
	fonctions_lib.dumpingin(dictionnaire_gramme_year,name_export_pkl,requete)


# Split by period: export one n-gram lexicon per entry of `years`.
print(dictionnaire_gramme_year.keys())
# Make sure the output directory exists. Only an "already exists" situation
# is tolerated; any other OSError (permissions, missing parent, ...) is
# re-raised instead of being silently swallowed.
try:
	os.mkdir(path_req +'years/')
except OSError:
	if not os.path.isdir(path_req +'years/'):
		raise
# Then iterate period by period.
for y,year in enumerate(years):
	# Sort by frequency and export the final lexicon with its occurrences.
	print('\n')
	print(year)

	dico_final = misc.freq_tri(dictionnaire_gramme_year[y],freqmin,int(math.floor(top*1.1)),language,ng_filter)
	# NOTE(review): only `filename` puts a '_' between freqmin and the period;
	# the two names below concatenate them directly. Left unchanged because
	# other code may look these files up by name -- confirm before fixing.
	filename = path_req +'years/'+ requete + '_' + str(freqmin) + '_' +str(year) + '_'+ 'liste_n-grammes_freq_divers.csv'
	filename_redond =  path_req +'years/'+ requete + '_' + str(freqmin) +str(year) + '_'+ 'liste_n-grammes_freq_divers_noredond.csv'
	filename_redond_leven =  path_req +'years/'+ requete + '_' + str(freqmin)+str(year) + '_' 'liste_n-grammes_freq_divers_leven_noredond.csv'
	# NOTE(review): Nb_rows is only refreshed further down in this iteration,
	# so this call receives whatever value Nb_rows held beforehand (the
	# pre-loop value on the first pass, the previous period's count after
	# that) -- confirm this is intended.
	misc.ecrire_liste_lemmes_freq(dico_final,Nb_rows,filename,lemme_maj,freqmin,ng_filter)
	print("\n+++"+str(len(dico_final))+" n-lemmes crees.")
	#leven.pack_rendondance(filename,filename_redond,maxTermLength,freqmin,language,redondance_manuelle,ng_filter,user_interface)
	leven.pack_rendondance_exact(filename,filename_redond,maxTermLength,freqmin,language,ng_filter,user_interface)
	print("\n")
	# Count the 'billets' rows whose `jours` value belongs to this period.
	Nb_rows = fonctions_bdd.count_rows_where(name_bdd,'billets'," where jours IN ('" + "','".join(list(map(str,year))) + "') ")
	print(Nb_rows)
	leven.pack_leven(filename_redond,filename_redond_leven,language,user_interface,freqmin,Nb_rows)

fusion_years.fusion('redond')