def sim_word_to_all(vectorspath, pivotwordvectors, identity, simfunction):
    """Score every vector in ``vectorspath`` against each pivot word vector.

    The vectors file is read as text. Its first line is skipped (presumably a
    header -- confirm against the file format) and every following non-blank
    line is split at its first space into a key and a run of
    whitespace-separated floats. The keys are pickled to
    ``vectorspath + '.wordspkl'`` unless that file already exists. For each
    pivot word, the dot product between its vector and every loaded vector
    is computed, the scores are joined with spaces, and the result is written
    as a single JSON string to
    ``target_dir_path + '/' + pivotword + '/' + identity + '.json'``.

    Relies on the module-level names ``target_dir_path``, ``logger``,
    ``tools``, ``np``, ``os``, ``pickle`` and ``json``.

    Args:
        vectorspath: Path of the text file holding one vector per line.
        pivotwordvectors: Mapping from pivot word to its vector, given as a
            string of whitespace-separated floats.
        identity: Base name (without extension) of every output file.
        simfunction: Not used by this function; kept so existing callers
            keep working.

    Raises:
        ValueError: If a non-blank line has no space separating key and
            vector, or a component cannot be parsed as a float.
    """
    keys = []
    vectors = []
    logger.info('reading vectors in '+vectorspath)
    with open(vectorspath) as fr:
        next(fr)  # skip the first line of the file
        for line in fr:
            line = line.strip()
            if not line:
                # A blank line carries no vector; skipping it keeps keys and
                # vectors aligned.
                continue
            key, rest = line.split(' ', 1)
            # A real list (not a lazy ``map`` object) so that numpy builds a
            # numeric matrix on Python 3 as well as Python 2.
            vectors.append([float(component) for component in rest.split()])
            keys.append(key)
            if len(keys) % 100000 == 0:
                logger.info('reading '+str(len(keys))+' vectors of '+vectorspath)

    wordspath = vectorspath+'.wordspkl'
    if not os.path.isfile(wordspath):
        with open(wordspath, 'wb') as handle:
            pickle.dump(keys, handle)

    # The matrix is identical for every pivot word, so build it only once.
    matrix = np.array(vectors)
    for pivotword, rawvector in pivotwordvectors.items():
        logger.info('multipying '+pivotword+' to '+str(len(vectors))+' vectors')
        pivotvector = np.array([float(component) for component in rawvector.split()])
        simres = np.dot(matrix, pivotvector)
        simresstr = ' '.join(map(str, simres))
        pivot_dir = target_dir_path+'/'+pivotword+'/'
        tools._mkdir_recursive(pivot_dir)
        dest_path = pivot_dir+identity+'.json'
        logger.info('saving '+pivotword+' to '+dest_path)
        with open(dest_path, 'w') as wr:
            wr.write(json.dumps(simresstr))
# Script entry point: read the command-line arguments, choose the input and
# output directories for the requested similarity type, then run
# simwordspan_calc and report any failure through the log and by e-mail.
logger.info('Starting...')

# Expected arguments: <simfunction> <outputcode> <simwordtype>
argv = sys.argv[1:]
simfunction = argv[0]
outputcode = argv[1]
simwordtype = argv[2]

if simwordtype == 'orig':
    simword_dir_path = '/data/nrekabsaz/similarity/result/randomness/'+simfunction+'/simword/'
    target_dir_path = '/data/nrekabsaz/similarity/result/randomness/'+simfunction+'/simwordspan/'
elif simwordtype == 'norm':
    # The original tested the undefined name ``values_ready`` here, so the
    # 'norm' variant could never be selected.
    simword_dir_path = '/data/nrekabsaz/similarity/result/randomness/'+simfunction+'/simwordnorm/'
    target_dir_path = '/data/nrekabsaz/similarity/result/randomness/'+simfunction+'/simwordspannorm/'
else:
    # Fail with a clear message instead of a NameError on target_dir_path.
    raise ValueError("simwordtype must be 'orig' or 'norm', got "+repr(simwordtype))

tools._mkdir_recursive(target_dir_path)

try:
    logger.info('simwordspan_calc ('+str(outputcode)+') '+simfunction+' started...')
    simwordspan_calc(outputcode)
    msgText = 'simwordspan_calc ('+str(outputcode)+') '+simfunction+' finished!'
    logger.info(msgText)
except Exception:
    # Top-level boundary: log the full traceback and notify by e-mail.
    msgText = 'run_simwordspan_calc('+str(outputcode)+') '+simfunction+'\n'+traceback.format_exc()
    logger.error(msgText)
    mail.sendemail_error(msgText)
    logger.info('email sent!')
def extended_distribution(pivotword_i):
    """Accumulate per-bin normal-distribution mass for one pivot word.

    Loads ``.../simword/<pivotword>/<identity>.json`` and reads each value of
    the loaded JSON object as ``[mean, std]``. Every pair gets a
    ``mean +/- 3*std`` interval. For each bin edge in
    ``np.arange(1, -0.2, -.001)`` the function adds up, over all pairs whose
    interval contains that edge, the difference of the normal CDF between
    the next edge and the current one (for the last edge: the CDF at that
    edge). A pair with ``std == 0`` contributes ``1.0`` instead. The per-edge
    sums are written space-separated to
    ``.../extendeddist/<pivotword>/<identity>.json``.

    Relies on the module-level names ``pivotwords``, ``identity``,
    ``simfunction``, ``logger``, ``tools``, ``mail``, ``norm``, ``np`` and
    ``KeyboardInterruptError``.

    Args:
        pivotword_i: Index into the module-level ``pivotwords`` sequence.

    Raises:
        KeyboardInterruptError: Raised in place of ``KeyboardInterrupt``.
        IndexError: If ``pivotword_i`` is out of range (the lookup happens
            before the ``try`` block).

    Any other exception is caught, e-mailed and logged; it is not re-raised.
    """
    pivotword = pivotwords[pivotword_i]
    logphrase = identity+'/'+pivotword
    try:
        logger.info(logphrase+' : starting')
        target_path = '/data/nrekabsaz/experiments/randomness/'+simfunction+'/extendeddist/'+pivotword+'/'+identity+'.json'
        tools._mkdir_recursive(os.path.dirname(target_path))
        simword_path = '/data/nrekabsaz/experiments/randomness/'+simfunction+'/simword/'+pivotword+'/'+identity+'.json'

        logger.info(logphrase+' : '+'loading' + simword_path)
        with open(simword_path) as frr:
            data = json.load(frr)

        logger.info(logphrase+' : '+'calculating meanstdtopdown')
        # One (mean, std, upper border, lower border) tuple per loaded entry.
        meanstdtopdown = []
        for meanstd_pair in data.values():
            mean = meanstd_pair[0]
            std = meanstd_pair[1]
            meanstdtopdown.append((mean, std, mean+3*std, mean-3*std))
        # Descending upper border: as the bin edges walk downwards, the
        # entries whose interval can contain an edge form a sliding window.
        meanstdtopdown.sort(key=lambda x: x[2], reverse=True)
        del data  # the raw entries are no longer needed

        logger.info(logphrase+' : '+'calculating mixture norms')
        edges = np.arange(1, -0.2, -.001)
        edge_total = len(edges)
        entry_total = len(meanstdtopdown)
        cursor = 0      # index of the entry currently being examined
        first_hit = 0   # first entry that contained the previous edge, or -1
        mixpdflist = []
        for edge_i, edge in enumerate(edges):
            mixpdf = 0
            # Restart from the first entry that contained the previous edge;
            # when none did, continue from where the previous scan stopped.
            if first_hit != -1:
                cursor = first_hit
            first_hit = -1
            while cursor < entry_total:
                mean, std, topbrd, downbrd = meanstdtopdown[cursor]
                if not (topbrd >= edge):
                    # Entries are sorted by upper border, so no later entry
                    # can reach this edge either.
                    break
                if downbrd <= edge:
                    if first_hit == -1:
                        first_hit = cursor
                    if std != 0:
                        # NOTE(review): the edges descend, so this CDF
                        # difference is non-positive while the std == 0
                        # branch adds +1.0 -- confirm the intended sign.
                        if edge_i+1 < edge_total:
                            mixpdf += (norm.cdf(edges[edge_i+1], loc=mean, scale=std)
                                       - norm.cdf(edge, loc=mean, scale=std))
                        else:
                            mixpdf += norm.cdf(edge, loc=mean, scale=std)
                    else:
                        mixpdf += 1.0
                cursor += 1
            mixpdflist.append(mixpdf)
            if edge_i % 50 == 0:
                logger.info(logphrase+' : calculating mixture value '+str(edge_i)+'/'+str(edge_total))

        logger.info(logphrase+' : saving into file')
        with open(target_path, 'w') as wr:
            wr.write(' '.join(map(str, mixpdflist)))
    except KeyboardInterrupt:
        raise KeyboardInterruptError()
    except Exception:
        msgText = identity+' : extended_distribution('+simfunction+') error!'+'\n'+traceback.format_exc()
        mail.sendemail_error(msgText)
        logger.error(msgText)
# Command-line parameters: <simfunction> <identity> <thread_no>
argv = sys.argv[1:]
simfunction = argv[0]
identity = argv[1]
thread_no = int(argv[2])

# Root path: the first two components of the directory holding this file
# (e.g. '/home/user/' for a file under '/home/user/...'), with a trailing '/'.
rootpath = '/'.join(os.path.dirname(os.path.realpath(__file__)).split('/')[0:3])+'/'

# Logger: every record goes both to a timestamped log file and to the console.
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger('simword')
logger_file_path = rootpath+'mycode/research_n_analyse/log/randomness/'+datetime.now().strftime('cosine_%H_%M_%d_%m_%Y.log')
tools._mkdir_recursive(os.path.dirname(logger_file_path))
file_hdlr = logging.FileHandler(logger_file_path)
file_hdlr.setFormatter(formatter)
logger.addHandler(file_hdlr)
logger.setLevel(logging.DEBUG)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# Start
begin_time = datetime.now()
logger.info('Starting...')

# Load the pivot-word vectors; the context manager closes the file even when
# parsing fails.
with open('vectors/vector200-1.txt.norm') as pivot_file:
    pivotwordvectors = json.load(pivot_file)