def fileFreq(text, dic):
    """Mark which dictionary words occur in ``text``.

    ``text`` is tokenised with ``wordFreq(text, stopword=stop, l=1)`` and
    every item obtained by iterating that result is looked up in ``dic``.

    Args:
        text: Text handed to ``wordFreq`` unchanged.
        dic: Extracted dictionary; only ``in`` membership is used.

    Returns:
        A ``defaultdict(int)`` that gains 1 for each iterated item found in
        ``dic``.  If ``wordFreq`` returns a mapping (unique keys), every
        stored value is therefore 1.
    """
    hits = defaultdict(int)
    for token in wordFreq(text, stopword=stop, l=1):
        if token not in dic:
            continue
        hits[token] += 1
    return hits
def BOWFeature(json_files, dicPath, feature_path=''):
    """Write bag-of-words features for the captioned JSON files.

    Walks ``base_path/<category>/jsons/``.  Every sub-directory of
    ``base_path`` is treated as one class and gets the next integer label,
    in ``os.listdir`` order.  For each regular file whose JSON keys
    (compared case-insensitively) include ``caption``, one tab-separated
    line is written to each output file:

    * ``feature_path``: ``name, label`` followed by ``tf[w]`` for every
      word ``w`` of the dictionary, in the dictionary's iteration order.
    * ``feature_path + '_org'``: ``name, label`` followed by
      ``word, tf[word]`` pairs for the caption words that are in the
      dictionary.

    ``tf`` is the result of ``wordFreq`` on the lower-cased caption
    (presumably word -> count; confirm against ``wordFreq``).

    Args:
        json_files: Unused.  The input location is the module-level
            ``base_path``; the parameter is kept for caller compatibility.
        dicPath: Passed to ``loadDict`` to load the dictionary.
        feature_path: Path of the dense feature file.  The default ``''``
            cannot be opened, so callers have to supply a real path.
    """
    dic = loadDict(dicPath)
    # Set copy gives O(1) membership tests; the dense row still iterates
    # ``dic`` itself so its column order is untouched.
    dic_words = set(dic)
    label = 0

    # Context managers close both files even if loading or parsing fails.
    with open(feature_path, 'w') as writer:
        with open(feature_path + '_org', 'w') as worg:
            for cat in os.listdir(base_path):
                cat_dir = os.path.join(base_path, cat)
                if not os.path.isdir(cat_dir):
                    continue
                json_dir = os.path.join(cat_dir, 'jsons')
                for f in os.listdir(json_dir):
                    json_path = os.path.join(json_dir, f)
                    if not os.path.isfile(json_path):
                        continue
                    jtexts = loadJson(json_path)
                    jtexts_lower = dict(
                        (k.lower(), v) for k, v in jtexts.items())
                    if 'caption' not in jtexts_lower:
                        continue
                    tf = wordFreq(jtexts_lower['caption'].lower(),
                                  stopword=stop, l=1)
                    fn = os.path.splitext(f)[0]
                    prefix = fn + '\t' + str(label) + '\t'

                    # Sparse line goes first: should ``tf`` be a
                    # defaultdict, the dense lookups below would add
                    # entries to it.
                    # The name and label are tab-separated here exactly as
                    # in the dense file (they used to be fused together,
                    # followed by an empty field).
                    # NOTE(review): ``w.encode`` assumes Python 2 unicode
                    # words written to a byte-oriented file - confirm
                    # before porting.
                    worg.write(prefix + '\t'.join(
                        w.encode('utf-8') + '\t' + str(tf[w])
                        for w in tf if w in dic_words) + '\n')
                    writer.write(prefix + '\t'.join(
                        str(tf[w]) for w in dic) + '\n')
                # One label per category directory.
                label += 1

    # Parenthesised form prints the same text on Python 2 and Python 3.
    print('BAG OF TEXTUAL FEATURE GENERATED')
def dicExt(path_list):
    """Collect per-class word statistics used to build the dictionary.

    Walks ``base_path/<category>/jsons/``.  Every sub-directory of
    ``base_path`` is treated as one class and gets the next integer label,
    in ``os.listdir`` order.  Only regular files whose JSON keys (compared
    case-insensitively) include ``caption`` are counted.

    Args:
        path_list: Unused.  The input location is the module-level
            ``base_path``; the parameter is kept for caller compatibility.

    Returns:
        A ``(frequency, clsize)`` tuple.  ``frequency[label][word]`` is
        incremented once per captioned file of class ``label`` for each
        item obtained by iterating ``wordFreq`` of the lower-cased caption.
        ``clsize[label]`` is the number of captioned files in that class.
        A class without captioned files has a ``clsize`` entry of 0 and no
        key in ``frequency``.
    """
    frequency = defaultdict(lambda: defaultdict(int))
    clsize = []
    label = 0

    for cat in os.listdir(base_path):
        cat_dir = os.path.join(base_path, cat)
        if not os.path.isdir(cat_dir):
            continue
        json_dir = os.path.join(cat_dir, 'jsons')

        count = 0  # captioned files seen in this class
        for f in os.listdir(json_dir):
            json_path = os.path.join(json_dir, f)
            if not os.path.isfile(json_path):
                continue
            jtexts = loadJson(json_path)
            # ``items()`` works on both Python 2 and Python 3 dicts.
            jtexts_lower = dict((k.lower(), v) for k, v in jtexts.items())
            if 'caption' not in jtexts_lower:
                continue
            tf_file = wordFreq(jtexts_lower['caption'].lower(),
                               stopword=stop, l=1)
            for w in tf_file:
                frequency[label][w] += 1
            count += 1

        clsize.append(count)
        label += 1

    return frequency, clsize