def fileFreq(text,dic):
    """Count dictionary hits among the entries wordFreq yields for text.

    text -- raw text, handed to wordFreq with stopword=stop and l=1.
    dic  -- container of accepted words; only membership (`in`) is used.

    Returns a defaultdict(int) in which every iterated entry that is also
    in dic has been incremented once per time it is yielded.
    """
    # NOTE(review): if wordFreq returns a mapping, iteration yields each
    # distinct word once, so every stored count would be 1 rather than the
    # word's frequency -- confirm that this is the intended result.
    hits = defaultdict(int)
    accepted = (entry for entry in wordFreq(text, stopword=stop, l=1)
                if entry in dic)
    for word in accepted:
        hits[word] += 1
    return hits
def BOWFeature(json_files, dicPath, feature_path=''):
    """Write bag-of-words feature rows for every captioned JSON file.

    Walks each category directory under the module-level base_path, reads
    the files in its 'jsons' subdirectory and, for each one that has a
    'caption' key (matched case-insensitively), writes one line to each of
    two output files:

    * feature_path: file name, label and tf[w] for every entry w of the
      loaded dictionary, tab separated.
    * feature_path + '_org': the word/count pairs of the caption words that
      are in the dictionary.

    The label starts at 0 and grows by one per category directory, in
    os.listdir order.

    json_files   -- unused; kept so existing callers keep working.
    dicPath      -- passed to loadDict to obtain the dictionary.
    feature_path -- path of the feature file to create.
    """
    dic = loadDict(dicPath)
    # A set gives O(1) membership tests; scanning a list copy of the
    # dictionary for every caption word was needlessly linear.
    dic_words = set(dic)
    label = 0
    # The with-blocks close both outputs even when a file fails to load.
    with open(feature_path, 'w') as writer:
        with open(feature_path + '_org', 'w') as worg:
            for cat in os.listdir(base_path):
                cat_dir = os.path.join(base_path, cat)
                if not os.path.isdir(cat_dir):
                    continue
                json_dir = os.path.join(cat_dir, 'jsons')
                for f in os.listdir(json_dir):
                    json_path = os.path.join(json_dir, f)
                    if not os.path.isfile(json_path):
                        continue
                    jtexts = loadJson(json_path)
                    # Lower-case the keys so 'Caption', 'CAPTION', ... match.
                    jtexts_lower = dict((k.lower(), v) for k, v in jtexts.items())
                    if 'caption' not in jtexts_lower:
                        continue
                    tf = wordFreq(jtexts_lower['caption'].lower(), stopword=stop, l=1)
                    fn = os.path.splitext(f)[0]

                    # Build this row before the dense row below: indexing tf
                    # with every dictionary word may add keys to tf if it is
                    # a defaultdict.
                    pairs = '\t'.join(w.encode('utf-8') + '\t' + str(tf[w])
                                      for w in tf if w in dic_words)
                    # NOTE(review): file name and label are concatenated with
                    # no separator and followed by two tabs, unlike the
                    # feature file's fn<TAB>label<TAB> layout. Kept
                    # byte-identical in case something parses this file --
                    # confirm whether it is a typo.
                    worg.write(fn + str(label) + '\t' + '\t' + pairs + '\n')

                    counts = '\t'.join(str(tf[w]) for w in dic)
                    writer.write(fn + '\t' + str(label) + '\t' + counts + '\n')
                label = label + 1
    # Parenthesised single-string form prints the same text on Python 2 and 3.
    print('BAG OF TEXTUAL FEATURE GENERATED')
def dicExt(path_list):
    """Collect per-category word counts from the captions under base_path.

    Walks each category directory under the module-level base_path and
    reads the files in its 'jsons' subdirectory. Files without a 'caption'
    key (matched case-insensitively) are skipped.

    path_list -- unused; the directories scanned come from base_path.

    Returns a tuple (frequency, clsize):

    * frequency[label][w] is incremented once for every entry w yielded by
      iterating the wordFreq result of a captioned file in that category.
    * clsize[label] is the number of captioned files in that category.

    Labels start at 0 and grow by one per category directory, in
    os.listdir order.
    """
    frequency = defaultdict(lambda: defaultdict(int))
    clsize = []
    label = 0
    for cat in os.listdir(base_path):
        cat_dir = os.path.join(base_path, cat)
        if not os.path.isdir(cat_dir):
            continue
        json_dir = os.path.join(cat_dir, 'jsons')
        count = 0
        for f in os.listdir(json_dir):
            json_path = os.path.join(json_dir, f)
            if not os.path.isfile(json_path):
                continue
            jtexts = loadJson(json_path)
            # Lower-case the keys so 'Caption', 'CAPTION', ... match.
            jtexts_lower = dict((k.lower(), v) for k, v in jtexts.items())
            if 'caption' not in jtexts_lower:
                continue
            tf_file = wordFreq(jtexts_lower['caption'].lower(), stopword=stop, l=1)
            for w in tf_file:
                frequency[label][w] += 1
            count += 1
        clsize.append(count)
        label += 1
    return frequency, clsize