def join(): """Join a network with the given name (SSID) and password""" print "SWIFT JOIN" print request.json #return json.dumps({"success": True}) # First we need to fish that information out of the JSON that was sent along with this d = request.body.read() print d if len(d) == 0: return json.dumps({"success": False}) try: dj = json.loads(d) except: print "Bad JSON data, aborting..." return json.dumps({"success": False}) if 'ssid' in dj: ssid_value = dj['ssid'] else: print "Missing SSID" return json.dumps({"success": False}) if 'password' in dj: passwd = dj['password'] # Salted, I hope else: print "Missing password" return json.dumps({"success": False}) # And here we should be able to join the network maybe print "Joining %s..." % ssid_value joiner.join(ssid_value,passwd) return json.dumps({"success": True})
def join(): """Join a network with the given name (SSID) and password""" print "SWIFT JOIN" print request.json #return json.dumps({"success": True}) # First we need to fish that information out of the JSON that was sent along with this d = request.body.read() print d if len(d) == 0: return json.dumps({"success": False}) try: dj = json.loads(d) except: print "Bad JSON data, aborting..." return json.dumps({"success": False}) if 'ssid' in dj: ssid_value = dj['ssid'] else: print "Missing SSID" return json.dumps({"success": False}) if 'password' in dj: passwd = dj['password'] # Salted, I hope else: print "Missing password" return json.dumps({"success": False}) # And here we should be able to join the network maybe print "Joining %s..." % ssid_value joiner.join(ssid_value, passwd) return json.dumps({"success": True})
def harmari_craigslist_parsing(html_file, meta_file):
    print('harmari_craigslist_parsing', html_file, meta_file)
    join_file = meta_file + '_join.csv'
    if os.path.exists(join_file):
        print('file extracted:', join_file)
        return
    print(' 1. start index..')
    if not os.path.exists(html_file + '_tag'):
        run_cpp('find_start.cpp', [html_file])
    print(' 2. start extract..')
    mkdir('html')
    mkdir('otherAttributes')
    run_cpp('extract.cpp', [html_file])
    print(' 3. count records..')
    n_records = run_cpp('lc.cpp', [meta_file], True).strip()
    print(' 4. parse html..')
    to_parse = html_parse(','.join([html_file, n_records]))
    print(' 5. join html and metadata..')
    join([html_file, meta_file, n_records])
    print('done')
def kclustering(top=100, pca=0):
    training = pd.read_csv(r'documents\csv\drunk\drunk labeling 1300' + '.csv')
    test = pd.read_csv(r'documents\csv\drunk\drunkTEXT400U' + '.csv')
    top = topwords(test, 'Clean tweet', top)
    main_domain = join(training, 'Clean tweet')
    main_domain1 = join(test, 'Clean tweet')
    main_domain.joinall(top.top, 1)
    main_domain1.joinall(top.top, 1)
    training = main_domain.df
    test = main_domain1.df
    cols = ['Clean tweet']
    try:
        for x in cols:
            del training[x]
            del test[x]
    except KeyError:
        pass
    print training['L']
    training.L = training.L.replace(['y', 'n'], [True, False])
    test.L = test.L.replace(['y', 'n'], [True, False])
    if pca == 1:
        dftraining, dftest = pcaf(training, test)
        training = dftraining.join(training["L"])
        test = dftest.join(test["L"])
    try:
        training = training.replace(['True', 'False'], [True, False])
        test = test.replace(['True', 'False'], [True, False])
    except Exception:
        pass
    headers_names = list(training.columns.values)
    training = training.astype(np.float64)
    test = test.astype(np.float64)
    training['L'] = training['L'].astype(bool)
    test['L'] = test['L'].astype(bool)
    headers_names.remove('L')
    headers_names.append('L')
    pca = str(pca)
    test = test[headers_names]
    training = training[headers_names]
    TRAINING = training.as_matrix(columns=None)
    TEST = test.as_matrix(columns=None)
    print training.dtypes
    main_domain.df.to_csv(r'documents\csv\unsupervised\test.csv', index=False)
    arff.dump(r'documents\Arff\unsupervised' + r'\training' + pca + '.arff',
              TRAINING, relation="whatever", names=headers_names)
    arff.dump(r'documents\Arff\unsupervised' + r'\test' + pca + '.arff',
              TEST, relation="whatever", names=headers_names)
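# Hedged usage sketch: generate the unsupervised ARFF files with and without
# the PCA reduction step (the CSV paths are the function's own defaults).
# kclustering(top=100, pca=0)
# kclustering(top=100, pca=1)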
def join_wrap():
    if len(sys.argv) < 4:
        print "Not enough arguments for join"  # TODO add help
        return
    subn = sys.argv[2]
    serv = sys.argv[3]
    J.join(subn, serv)
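# Hedged usage sketch: join_wrap reads the subcommand's arguments from fixed
# argv positions, so the expected invocation looks like (script name and
# argument meanings are assumptions):
#   python tool.py join <subnet> <server>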
def addbigrams(dft, dfte, df1, selector=0, n=50):
    top = topwords(df1, 'Clean tweet', n)  # note: unused in this function
    bigrams = ngrams(df1, 'Clean tweet')
    bigramsw = bigrams.bigrams
    main_domain = join(dft, 'Clean tweet')
    main_domain1 = join(dfte, 'Clean tweet')
    main_domain.joinall(bigramsw, 2)
    main_domain1.joinall(bigramsw, 2)
    return main_domain.df, main_domain1.df
def getmostcommon(df, df1, n=10):
    # Join the top-n words and bigrams (drawn from df1) onto both frames.
    main_domain = join(df, 'Clean tweet')
    main_domain1 = join(df1, 'Clean tweet')
    top = topwords(df1, 'Clean tweet', n)    # was self.df2: undefined in a free function
    bigrams = ngrams(df1, 'Clean tweet', n)  # was self.df2: undefined in a free function
    topw = top.top
    bigramsw = bigrams.bigrams
    main_domain.joinall(topw, 1)
    main_domain.joinall(bigramsw, 2)
    main_domain1.joinall(topw, 1)
    main_domain1.joinall(bigramsw, 2)
    return main_domain.df, main_domain1.df
def main():
    warnings.warn = warn
    mk_dir()
    print('# split\n')
    split()
    print('# building dictionary\n')
    for file in tqdm(os.listdir('TEMP')):
        statinfo = os.stat('TEMP/' + file)
        # skip empty shells
        if statinfo.st_size > 1000:
            try:
                df = build_dict(file)
                df = agg(df)
                # can't use "if df" on a DataFrame, so test the index length
                if len(df.index) != 0:
                    df['size'] = statinfo.st_size
                    df['raw_label'] = file.replace('.pcap', '')
                    df.to_csv('TEMP_CSV/' + file.replace('.pcap', '.csv'),
                              index=False)
            except Exception as e:
                print(e)
    print('# collecting dataframe\n')
    r, c = join()
    print('\nGOT ', r, 'nonzero records out of ',
          len(os.listdir('TEMP')), 'files\n')
    print('df shape: ', r, 'x', c)
    mk_dir()
def test_inner_join(self):
    inner = join(dogs, cats, 'inner', 'name')
    assert len(inner) == 2
    assert inner[0][0].name == 'gatsby'
    assert inner[0][1].meow == 'rowr'
    assert inner[0][0].weight == 16
    assert inner[0][1].weight == 15
def blsSummaryPlot(clipList, num=None):
    mags = np.loadtxt("kees-c5.mags", delimiter="|")
    nPad = mags.shape[1]

    # epic, blsArray = gather.gatherValue(clipList, 'bls.convolved_bls')
    epic, blsArray = gather.gatherFunction(clipList, getBls)

    # Strip out the occasional bls spectrum with non-standard length
    lengths = np.array(map(lambda x: len(x), blsArray))
    typicalLength = int(np.median(lengths))
    idx = lengths == typicalLength
    epic = np.array(epic)[idx]
    blsArray = np.array(blsArray)[idx]

    obj = np.column_stack([epic, blsArray])
    obj2 = join(mags, 0, None, obj, 0, None, dtype=object)
    nPad += 1
    # print obj2.shape

    magCol = 3
    idx = np.argsort(obj2[:, magCol])
    obj2 = obj2[idx]
    mag = obj2[:, magCol]
    blsArray = np.vstack(obj2[:, -1])
    print blsArray.shape
    # return obj2

    mp.clf()
    mp.imshow(blsArray, interpolation="nearest", origin="bottom",
              aspect="auto", cmap=mp.cm.YlGnBu_r)
    mp.colorbar()
    return blsArray
def join(self, p_strLeftTable, p_strRightTable, p_lColumn, p_strType="inner"):
    return join.join(self.dTables[p_strLeftTable],
                     self.dTables[p_strRightTable],
                     p_lColumn, p_strType)
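# Hedged usage sketch (table and column names are hypothetical): join two
# registered tables on a column list; p_strType defaults to "inner".
# result = db.join('orders', 'customers', ['customer_id'], p_strType='left')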
def modelExperiment(insampleData, outsampleData, dataFolder, componentList,
                    models, modelNames, tableFile, plotFile, buildFV=True):
    j1 = myJoin.join(insampleData, outsampleData, dataFolder)
    j1.setComponentList(componentList)
    if buildFV:
        j1.buildInsampleFV()
    else:
        j1.loadCachedInsampleFV()
    modelResults = []
    for (mod, modName) in zip(models, modelNames):
        j1.model = mod
        j1.modelName = modName
        precision, recall, runtime = j1.quickExperiment()
        modelResults.append([modName, precision, recall, runtime])

    # Write summary of results to csv table
    writeToCSV(dataFolder + tableFile,
               ['', 'Precision', 'Recall', 'Runtime'], modelResults)

    # Write summary of results to plot
    precisionList = [res[1] for res in modelResults]
    recallList = [res[2] for res in modelResults]
    runtimeList = [res[3] for res in modelResults]

    # create plot
    fig, ax = plt.subplots()
    index = np.arange(len(models))
    bar_width = 0.35
    opacity = 0.8
    rects1 = plt.bar(index, tuple(precisionList), bar_width,
                     alpha=opacity, color='b', label='Precision')
    rects2 = plt.bar(index + bar_width, tuple(recallList), bar_width,
                     alpha=opacity, color='g', label='Recall')
    plt.xlabel('Classifier')
    plt.ylabel('Scores')
    plt.title('Bootstrapped Precision and Recall Scores vs. Classifier')
    plt.xticks(index + bar_width, tuple(modelNames))
    plt.legend()
    plt.tight_layout()
    plt.savefig(dataFolder + plotFile)
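# Hedged usage sketch (data names and models are assumptions): compare two
# classifiers on the same feature-vector components and write the summary
# table and bar plot into the data folder.
# modelExperiment(insample, outsample, 'data/', fullFV,
#                 [LogisticRegression(), RF(n_estimators=200)],
#                 ['LogReg', 'RF'], 'modelTable.csv', 'modelPlot.png')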
def featureVectorExperiment(insampleData, outsampleData, dataFolder,
                            allComponents, model, modelName, tableFile,
                            plotFile):
    j1 = myJoin.join(insampleData, outsampleData, dataFolder)
    j1.model = model
    j1.modelName = modelName
    FVResults = []
    FVNames = []
    for componentList in allComponents:
        j1.setComponentList(componentList)
        j1.buildInsampleFV()
        precision, recall, runtime = j1.quickExperiment()
        FVNames.append(j1.FVDescription)
        FVResults.append([j1.FVDescription, precision, recall, runtime])

    # TODO this can be written to a function
    # Write summary of results to csv table
    wr = csv.writer(open(dataFolder + tableFile, 'wb'),
                    quoting=csv.QUOTE_ALL)
    header = ['', 'Precision', 'Recall', 'Runtime']
    wr.writerow(header)
    for row in FVResults:
        wr.writerow(row)

    # Write summary of results to plot
    precisionList = [res[1] for res in FVResults]
    recallList = [res[2] for res in FVResults]
    runtimeList = [res[3] for res in FVResults]

    # create plot
    fig, ax = plt.subplots()
    index = np.arange(len(allComponents))
    bar_width = 0.35
    opacity = 0.8
    rects1 = plt.bar(index, tuple(precisionList), bar_width,
                     alpha=opacity, color='b', label='Precision')
    rects2 = plt.bar(index + bar_width, tuple(recallList), bar_width,
                     alpha=opacity, color='g', label='Recall')
    plt.xlabel('Feature Vector')
    plt.ylabel('Scores')
    plt.title('Bootstrapped Precision and Recall Scores using %s '
              'vs. Feature Vector' % (modelName))
    plt.xticks(index + bar_width, tuple(FVNames))
    plt.legend()
    plt.tight_layout()
    plt.savefig(dataFolder + plotFile)
def NLMexperiments():
    j2 = myJoin.join(nlmInsampleData, nlmOutsampleData, 'NLMdata/')
    j2.setComponentList(fullFV)
    j2.loadCachedInsampleFV()
    results = []
    for prop in np.arange(0.05, 0.25, 0.01):
        precision, recall, _, size = j2.classifyNIterations(
            subSampleProportion=prop)
        results.append([size, precision, recall])
    writeToCSV('NLMdata/sizeTest2.csv', ['Size', 'Precision', 'Recall'],
               results)
def classifyAndPredict(insampleData, outsampleData, folderName,
                       componentList):
    print len(insampleData[0])
    print len(outsampleData[1])
    # Declare an instance of a join object with the input arguments
    easyJoin = myJoin.join(insampleData, outsampleData, folderName)
    easyJoin.setComponentList(componentList)
    # Build the feature vectors
    easyJoin.buildInsampleFV()
    easyJoin.buildOutsampleFVReduced(0.01)
    # Classify and predict with logistic regression
    easyJoin.classify()
    easyJoin.classifyNIterations()
    easyJoin.predict()
def thresholdExperiment(insampleData, outsampleData, dataFolder,
                        allComponents, model, modelName, tableFile):
    # Sweep thresholds from 0 to 1 in steps of 0.01 (the original had the
    # stop and step arguments swapped, which yielded only a single value).
    thresholdRange = np.arange(0.0, 1.01, 0.01)
    j1 = myJoin.join(insampleData, outsampleData, dataFolder)
    j1.model = model
    j1.modelName = modelName
    j1.setComponentList(allComponents)
    j1.loadCachedInsampleFV()
    expResults = []
    for tHold in thresholdRange:
        precision, recall, runtime = j1.classifyNIterations(tHold)
        expResults.append([tHold, precision, recall, runtime])
        print tHold
    writeToCSV(dataFolder + tableFile,
               ['Threshold', 'Precision', 'Recall', 'Runtime'], expResults)
def PARRES(tPAR0, tRED0):
    """Join the PAR and RED tables on their shared 'seed' key and add
    derived columns."""
    tPAR = copy.deepcopy(tPAR0)
    tRED = copy.deepcopy(tRED0)
    tPAR.set_primary_key('seed')
    tRED.set_primary_key('seed')
    tRED = join.join(tPAR, tRED)
    # Convert epochs back into input
    tRED.data['oepoch'] = np.remainder(tRED.oepoch, tRED.P)
    addbg(tRED)
    addFlag(tRED)
    return tRED
def buildQuery(s, tables):
    tokens = nltk.word_tokenize(s)
    Tagged = nltk.pos_tag(tokens)
    subjects = 0
    # joined = False
    for item in Tagged:
        if item[1] == 'NNS':
            subjects += 1
    if subjects >= 2:
        # joined = True
        s = join(s, tables)
    s = Dict.DoReplacing(s)
    # s = 'Show me when both Mario and Sonic first came out'
    s = s.strip()  # Remove leading and trailing whitespace
    s = ReplaceNotFirst(s, 'where', 'and')
    s = s.replace('and and', 'and')
    # Remove generic database references
    if s.lower().find("from database") != -1:
        s = Dict.Replace(s, [" from database"], '')
    tokens = nltk.word_tokenize(s)
    # if there is no "from"
    if s.find('from') == -1 and s.find('select') != -1:
        temp = tokens[tokens.index('select') + 1]
        temps = singularize(temp)
        tempp = pluralize(temps)
        s = s.replace('select ' + temp,
                      'select ' + temps + ' from ' + tempp)
        tokens = nltk.word_tokenize(s)
    if tokens[tokens.index('from') + 1] not in tables:
        s = noTableName(s, tables, tokens[tokens.index('from') + 1])
    if s[-1] != ';':
        s = s + ';'
    tokens = s.split(' ')
    # manageStringVars takes the string and token list, looks for an
    # operator, and if the following token is not a digit it adds quotes
    # around it.
    s = manageStringVars(s, tokens)
    return s
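# Hedged illustration only (the real output depends on the Dict, join, and
# noTableName helpers, which are not shown): a sentence naming two plural
# nouns trips the subjects >= 2 branch and is routed through join() before
# the SQL cleanup passes.
# buildQuery('show me the names of dogs and cats', tables)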
def getintersection(df, selector=0, n=50):
    main_domain = join(df, 'Clean tweet')
    top = topwords(df, 'Clean tweet', n)
    bigrams = ngrams(df, 'Clean tweet')
    topw = top.top
    bigramsw = bigrams.bigrams
    main_domain.joinall(topw, 1)
    mutualwordsu = mutualinfo(main_domain.df)
    main_domain.joinall(bigramsw, 2)
    mutualwordsb = mutualinfo(main_domain.df)
    mutualwordsb = [e for e in mutualwordsb if e not in mutualwordsu]
    ratiov = ratio(main_domain.df, 'L')
    ratios = ratiov.getoddratios(top.top)
    dratios = list(ratios.keys())
    return topw, bigramsw, dratios, mutualwordsu, mutualwordsb
def SOexperiments():
    j1 = myJoin.join(SOInsampleData, SOOutsampleData, 'stackoverflowdata/')
    j1.setComponentList(fullFV)
    j1.buildInsampleFV()
    j1.model = RF(n_estimators=200)
    j1.modelName = 'RF'

    def threshHoldTest():
        singleThreshTest = j1.thresholdTest(np.arange(0.0, 1.01, 0.01))
        writeToCSV('stackoverflowdata/simpleThresholdTest1.csv',
                   ['Threshold', 'Precision', 'Recall'], singleThreshTest)
        print 'simple thing done'
        fiftyThreshTest = [
            j1.thresholdTest(np.arange(0.0, 1.01, 0.01)) for i in range(50)
        ]
        mean_values = np.mean(fiftyThreshTest, axis=0)
        writeToCSV('stackoverflowdata/fiftyThresholdTest1.csv',
                   ['Threshold', 'Precision', 'Recall'], mean_values)
        print 'fifty thing done'

    thresholdExperiment(SOInsampleData, SOOutsampleData,
                        'stackoverflowdata/', fullFV,
                        RF(n_estimators=200), 'RF',
                        'thresholdExperiment1.csv')
def join(self, right, left_outer=False, right_outer=False, **keys):
    ''' '''
    from atom import Atom
    from join import join

    new = self
    # Accept a single element or a list of elements to join against.
    for right in right if isinstance(right, list) else [right]:
        name = right.__name__
        index = join(self, right, keys, left_outer, right_outer)
        right = Element(
            source=right,
            cnames=[x for x in right.__cnames__ if x not in keys.values()],
            index=Atom(index.right.new_index[index.mask],
                       mask=(index.right.new_index >= 0)[index.mask]))
        new = Element(source=new,
                      index=Atom(index.left.new_index[index.mask],
                                 mask=(index.left.new_index >= 0)[index.mask]),
                      cnames=new.__cnames__ + [name])
        setattr(new, name, right)
    return new
def outer(self, right, **keys):
    ''' '''
    return join(self, right, True, True, **keys)
    # ...tail of gettime(): pull the time and date groups out of the EXIF text.
    infile.close()
    timepattern = r'(\d{2}):(\d{2}):(\d{2})'
    datepattern = r'(\d{4}):(\d{2}):(\d{2})'
    group1 = re.search(timepattern, temp[-1]).groups()
    group2 = re.search(datepattern, temp[-2]).groups()
    return group1, group2


def prefix(time, date, filename):
    """
    Creates a new filename prefix if needed.
    Does not actually rename the file though!
    """
    prefix = filename
    pattern = r'\d{4}_+\d{2}_+\d{2}_+\d{2}_+\d{2}_+\d{2}_+[img]_+\d+\.[jpg]'
    match = re.search(pattern, filename)
    if not match:
        prefix = join.join('_', date, time, filename)
    return prefix


tmp, gls = gettime(somefile)
bla = prefix(tmp, gls, somefile)
print bla
#os.rename(somefile, bla)
'''
fredrik@fredrik-Aspire-V3-571:~/uio/inf3331/uke5$ python jpegrename.py tmp2.jpg
2002_05_19_18_10_03_tmp2.jpg
'''
def measure_join():
    s = join.join(int(1e6))
    print("{}".format(len(s)))
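# The join module under test is not shown; a plausible minimal stand-in for
# the benchmark above (an assumption, not the real implementation) would
# build one string of n comma-separated integers:
def join(n):
    # str.join over a generator avoids quadratic string concatenation
    return ",".join(str(i) for i in range(n))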
def parse_input(filename, tiles, globalfreevars):
    # tiles is a global list of actually known tiles;
    # it is used to synchronise symbols between multiple automata.
    # globalfreevars ...
    contains = load_input(filename)
    preds = {}
    if re.search("^RootCall", contains[0]):
        # NEW version
        # parse the "RootCall"
        # first check whether there are existentially quantified parameters
        ex_params = []
        if re.search("^RootCall\\\\E", contains[0]):
            ex_params = re.sub('^RootCall\\\\E([^\.]*)\..*$', '\\1', contains[0])
            ex_params = re.split(",", ex_params)
            contains[0] = re.sub("\\\\E([^\.]*)\.", "", contains[0])
        # then check whether the join operation is needed --- RootCall contains "*"
        rootcall = re.sub('^RootCall', "", contains[0])
        del contains[0]
        top_calls = []
        pt_seq = 0
        while re.search("\\*", rootcall):
            # first handle the "->" predicate
            if re.search("^[^\*]*->", rootcall):
                lhs = re.sub("^([^-]*)->.*$", "\\1", rootcall)
                rhs = re.sub("^[^-]*->([^\*]*)\*.*$", "\\1", rootcall)
                rhs = re.split(",", rhs)
                # remove nil and double occurrences from rhs
                rhs_not_nil = []
                for x in rhs:
                    if (not x == "nil") and (not x == lhs) and (not x in rhs_not_nil):
                        rhs_not_nil.append(x)
                # create a unique predicate for the points-to
                pred_name = get_unique_name("pt%i" % pt_seq, contains)
                pt_seq = pt_seq + 1
                top_calls.append((pred_name, [lhs] + rhs_not_nil))
                rule = (lhs, rhs, [], [])
                preds[pred_name] = ([lhs] + rhs_not_nil, [rule])
                # do implicit quantification
                add_implicit_exists(ex_params, globalfreevars, [lhs] + rhs_not_nil)
            else:
                # store the predicate call into top_calls
                call = re.sub("^([^\(]*)\(.*$", "\\1", rootcall)
                call_params = re.sub("^[^\(]*\(([^\)]*)\).*$", "\\1", rootcall)
                call_params = re.split(",", call_params)
                top_calls.append((call, call_params))
                # do implicit quantification
                add_implicit_exists(ex_params, globalfreevars, call_params)
            # remove the call
            rootcall = re.sub("^[^\*]*\*", "", rootcall)
        if re.search("^[^\*]*->", rootcall):
            lhs = re.sub("^([^-]*)->.*$", "\\1", rootcall)
            rhs = re.sub("^[^-]*->([^\*]*)$", "\\1", rootcall)
            rhs = re.split(",", rhs)
            # remove nil and double occurrences from rhs
            rhs_not_nil = []
            for x in rhs:
                #if not x=="nil":
                if (not x == "nil") and (not x == lhs) and (not x in rhs_not_nil):
                    rhs_not_nil.append(x)
            # create a unique predicate for the points-to
            pred_name = get_unique_name("pt%i" % pt_seq, contains)
            pt_seq = pt_seq + 1
            top_calls.append((pred_name, [lhs] + rhs_not_nil))
            rule = (lhs, rhs, [], [])
            preds[pred_name] = ([lhs] + rhs_not_nil, [rule])
            # do implicit quantification
            add_implicit_exists(ex_params, globalfreevars, [lhs] + rhs_not_nil)
        else:
            call = re.sub("^([^\(]*)\(.*$", "\\1", rootcall)
            call_params = re.sub("^[^\(]*\(([^\)]*)\).*$", "\\1", rootcall)
            call_params = re.split(",", call_params)
            top_calls.append((call, call_params))
            # do implicit quantification
            add_implicit_exists(ex_params, globalfreevars, call_params)
        type = 2
    else:
        # OLD version, kept just for compatibility reasons
        # get parameters
        if not re.search("^Params", contains[0]):
            raise InputError("No \"Params\" specified on 1st (nonempty) line of input")
        params = re.sub("^Params", "", contains[0])
        params = re.split(",", params)
        del contains[0]
        # get root rule identifier
        if not re.search("^Root", contains[0]):
            raise InputError("No \"Root\" specified on 2nd (nonempty) line of input")
        root_rule = re.sub("^Root", "", contains[0])
        del contains[0]
        type = 0
    # Parse predicates
    empty_rule = 0
    for x in contains:
        empty_rule = empty_rule + parse_predicate(x, preds)
    if empty_rule:
        # empty rules in the system of predicates -> inline them and
        # create a formula for the empty heap
        emptyheap_eq = inline_empty_rules(preds, top_calls)
    else:
        # No empty heap defined by the system of predicates ---> false,
        # represented as []
        emptyheap_eq = []
    if type == 2:
        # type==2: join operator on top-level calls to translate into a
        # single RootCall
        (root_rule, params, emptyheap_eq) = join.join(preds, top_calls,
                                                      emptyheap_eq, ex_params)
        # rename all variables in conflict between parameters and predicates
        rename_conflicts_with_params(preds, params)
        # track and eliminate all parameters
        for i in range(0, len(params)):
            # or (params[i]=="nil"): nil is always handled as an
            # existentially quantified variable
            ex_quantif = (params[i] in ex_params)
            root_rule = trackeliminate(preds, root_rule, 0, params[i],
                                       ex_quantif)
        # remove unreachable predicates
        remove_unreachable_predicates(preds, root_rule)
        # remove "nil" from params
        new_params = []
        for i in params:
            if i != "nil" and not (i in ex_params):
                new_params.append(i)
        params = new_params
        # remove ex_params from emptyheap_eq
        new_emptyheap_eq = []
        for disj in emptyheap_eq:
            new_disj = []
            for conj in disj:
                new_conj = []
                for x in conj:
                    if not x in ex_params:
                        new_conj.append(x)
                if len(new_conj) > 1:
                    new_disj.append(new_conj)
            new_emptyheap_eq.append(new_disj)
        emptyheap_eq = new_emptyheap_eq
    else:
        # OLD version, just for compatibility reasons (type==0)
        if not emptyheap_eq == []:
            emptyheap_eq = emptyheap_eq[0]
    sig = compute_signature(preds)
    aut, eq_edges = sl2ta(preds, sig, params, tiles, root_rule)
    #if eq_edges:
    #    print "WARNING: equality edges in use"
    return aut, emptyheap_eq, eq_edges
def right_outer(self, right, **keys):
    ''' '''
    return join(self, right, False, True, **keys)
def inner(self, right, **keys):
    ''' '''
    return join(self, right, False, False, **keys)
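# Hedged usage sketch for the wrappers above (element and key names are
# hypothetical): inner() keeps only matching rows, outer() also keeps the
# unmatched rows from both sides.
# joined = orders.inner(customers, customer_id='id')
# full = orders.outer(customers, customer_id='id')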
def test_outer_join(self):
    outer = join(dogs, cats, 'outer', 'weight')
    assert len(outer) == 8
def spectralcluster(self, A=1, varydocument=0, joineig=0,
                    undersamplingv=False):
    # 1. join the documents and get the test sample
    def joind(df1, df2, size=0.1, undersamplingv=False, varydocument=0):
        df1.L = df1.L.replace(['y', 'Y', 'n', 'n '],
                              ['True', 'True', 'False', 'False'])
        df2.L = df2.L.replace(['y', 'Y', 'n', 'n '],
                              ['True', 'True', 'False', 'False'])
        joindf = joindocuments(df1, df2)
        if varydocument == 0:
            df1, otro = joindf.gettrainingandtestp(df1, size)
        if varydocument == 1:
            df2, otro = joindf.gettrainingandtestp(df2, size)
        joinc = joindocuments(df1, df2)
        if undersamplingv:
            df2 = joinc.undersampling(df2)
            df1 = joinc.undersampling(df1)
        undersampleddf1 = joinc.join(df1, df2)
        return undersampleddf1

    undersampleddf1 = joind(self.df1, self.df2, A, undersamplingv,
                            varydocument)
    #undersampleddf1, undertest = joinc.joinsourcetarget(A, varydocument)
    undertest = pd.read_csv(r'documents\csv\drunk\drunkTEXT400U' + '.csv')
    undertest.L = undertest.L.replace(['y', 'Y', 'n', 'n '],
                                      [True, True, False, False])
    # join the domain-specific features to the training and test samples
    laplacian = Laplacian_matrix(self.df1, self.df2, 'Clean tweet')
    la, ds, di = laplacian.LAPLACE_NORMALIZED()
    self.domain_specific = ds
    # +['sentiment_polarity','sentiment_subjectivity','absPolarity']
    self.laplacian_matrix = la
    joiner = join(undersampleddf1, 'Clean tweet')

    def getmostcommon(df, df1, n=10):
        main_domain = join(df, 'Clean tweet')
        main_domain1 = join(df1, 'Clean tweet')
        top = topwords(self.df2, 'Clean tweet', n)
        bigrams = ngrams(self.df2, 'Clean tweet', n)
        topw = top.top
        bigramsw = bigrams.bigrams
        main_domain.joinall(topw, 1)
        main_domain.joinall(bigramsw, 2)
        main_domain1.joinall(topw, 1)
        main_domain1.joinall(bigramsw, 2)
        return main_domain.df, main_domain1.df

    tainingt, testt = getmostcommon(undersampleddf1, undertest, 10)
    #tainingt, testt = addbigrams(tainingt, testt, self.df2)
    self.matrixtodott, self.matrixtodotest = joiner.jointwodocuments(
        undersampleddf1, undertest, ds, 1)
    if joineig == 0:
        trainingset, headerst = self.domain_specificbyeigenvector(tainingt)
        testset, headerstest = self.domain_specificbyeigenvector(testt)
    elif (joineig == 1) or (joineig == 2):
        trainingset = tainingt
        testset = testt
        if joineig == 2:
            trainingset, headerst = self.domain_specificbyeigenvector(tainingt)
            testset, headerstest = self.domain_specificbyeigenvector(testt)
            headerst = headerst + ['L']
            trainingset = trainingset[headerst]
            testset = testset[headerst]
    return trainingset, testset
while m3u8_info[0][i] <= end_flag:
    if i >= m3u8_num:
        break
    i += 1
end = i

# Download the segment files
pro = ProgressBar(total=end + 1 - start)
starttime = datetime.datetime.now()
for i in range(start, end + 1):
    savepath_filename = path + '\\ts\\' + '%06d' % i + '.ts'
    downloader(m3u8_info[1][i], savepath_filename)
    untilnowtime = datetime.datetime.now()
    interval = (untilnowtime - starttime).seconds
    pro.move()
    pro.log()

# Now merge the segment files
fromdir = path + '\\ts\\'
tofile = (path + '\\' + title + '[' + str(start_flag) + '-' +
          str(end_flag) + ']' + '.ts')
join(fromdir, tofile)
print('Merged files successfully!')
shutil.rmtree(fromdir)
os.mkdir(fromdir)
print('Cleaned up temporary files successfully!')
endtime = datetime.datetime.now()
interval = (endtime - starttime).seconds
print('Total time: ' + str(interval / 60) + ' min (' + str(interval) + ' s)')
def alltogether(A, varydocument=0, x=0):
    df1 = pd.read_csv(r'documents\csv\pregnancy\GOOD LABELING 170620151' + '.csv')
    df2 = pd.read_csv(r'documents\csv\drunk\drunk labeling 1300' + '.csv')
    laplacian = Laplacian_matrix(df1, df2, 'Clean tweet')
    la, ds, di = laplacian.LAPLACE_NORMALIZED()
    classesname = ds + di
    n = len(classesname)
    print 'titles'
    print n
    allclusters = []
    print x
    prueba = cluster(la, x, classesname)
    lencluster, clusterd = prueba.cluster
    allclusters.append(clusterd)
    clusterslong = []
    for x in allclusters:
        for y in x:
            if len(y) > 1 and len(y) < 100:
                clusterslong.append(y)
    print clusterslong
    clustersall = []
    for i in clusterslong:
        if i not in clustersall:
            clustersall.append(i)
    print len(clustersall)
    print 'pass1'
    joinc = joindocuments(df1, df2)
    print 'pass1.a'
    undersampleddf1, undertest = joinc.joinsourcetarget(A, varydocument)
    joiner = join(undersampleddf1, 'Clean tweet')
    print 'pas2'
    tainingt, testt = joiner.jointwodocuments(undersampleddf1, undertest,
                                              clustersall, 4)
    print 'ta'
    ratiov = ratio(tainingt, 'L')
    a = ['L', 'absPolarity', 'sentiment_polarity', 'sentiment_subjectivity']
    cols = ['Clean tweet']
    try:
        for x in cols:
            del tainingt[x]
            del testt[x]
    except KeyError:
        pass
    headers_names = list(tainingt.columns.values)
    headers_names.remove('L')
    headers_names.append('L')
    tainingt = tainingt[headers_names]
    testt = testt[headers_names]
    tainingt = tainingt.replace(['True', 'False'], [True, False])
    testt = testt.replace(['True', 'False'], [True, False])
    TRAINING = tainingt.as_matrix(columns=None)
    A = str(A)
    arff.dump(r'documents\Arff\cluster\trainning' + A + '.arff', TRAINING,
              relation="whatever", names=headers_names)
    TEST = testt.as_matrix(columns=None)
    arff.dump(r'documents\Arff\cluster\test' + A + '.arff', TEST,
              relation="whatever", names=headers_names)
def domain(document, crossvalidationundersampling, ArffL, A=0,
           undersampler=0, sentiment=0):
    test = pd.read_csv(r'documents\csv\drunk\drunkTEXT400' + '.csv')
    test.L = test.L.replace(['y', 'n'], ['True', 'False'])
    df1 = pd.read_csv(document + '.csv')
    df1.L = df1.L.replace(['y', 'n'], ['True', 'False'])
    joinc = joindocuments(df1, df1)
    top = topwords(df1, 'Clean tweet', 100)
    main_domain = join(df1, 'Clean tweet')
    bigrams = ngrams(df1, 'Clean tweet')
    print 'bigrams'
    print bigrams.bigrams
    main_domain.joinall(bigrams.bigrams, 2)
    main_domain.joinall(top.top, 1)
    main_domain.df.to_csv('prueba.csv', index=False)
    ratiov = ratio(main_domain.df, 'L')
    ratios = ratiov.getoddratios(top.top)
    print 'ratios'
    print ratios
    ds = list(ratios.keys())
    testobject = join(test, 'Clean tweet')
    oddradiojoin = join(df1, 'Clean tweet')
    oddradiojoin.joinall(ds, 1)
    testobject.joinall(ds, 1)
    oddradiojoin.joinall(bigrams.bigrams, 2)
    testobject.joinall(bigrams.bigrams, 2)
    test = testobject.df
    cols = ['Clean tweet']
    if sentiment == 1:
        cols = ['Clean tweet', 'sentiment_polarity',
                'sentiment_subjectivity', 'absPolarity']
    try:
        for x in cols:
            del oddradiojoin.df[x]
            del test[x]
    except KeyError:
        pass
    #training, test = joinc.gettrainingandtestp(oddradiojoin.df)
    print 'matrix of elements to reduce'
    print "saul,", oddradiojoin.df.shape
    if undersampler == 1:
        print "saul,", oddradiojoin.df.shape
        oddradiojoin.df = joinc.undersampling(oddradiojoin.df)
        print oddradiojoin.df.shape
    if A == 1:
        dftraining, dftest = pcaf(oddradiojoin.df, test)
        oddradiojoin.df = dftraining.join(oddradiojoin.df["L"])
        test = dftest.join(test["L"])
        print oddradiojoin.df.shape
    training = oddradiojoin.df
    training = training.replace(['True', 'False'], [True, False])
    test = test.replace(['True', 'False'], [True, False])
    training = training.astype(np.float64)
    test = test.astype(np.float64)
    training['L'] = training['L'].astype(bool)
    test['L'] = test['L'].astype(bool)
    A = str(A)
    sentiment = str(sentiment)
    oddradiojoin.df.to_csv('crossvalidation.csv', index=False)
    #undersampleddf1.to_csv(str(crossvalidationundersampling) + '\undersampling' + A + '.csv', index=False)
    headers_names = list(training.columns.values)
    headers_names.remove('L')
    headers_names.append('L')
    headers_names1 = list(test.columns.values)
    print headers_names, 'headers test', headers_names1
    test = test[headers_names]
    training = training[headers_names]
    print 'training' + str(training.dtypes)
    test.to_csv(str(crossvalidationundersampling) + r'\test1' + A + '.csv',
                index=False)
    training.to_csv(str(crossvalidationundersampling) + r'\training1' + A +
                    '.csv', index=False)
    TRAINING = training.as_matrix(columns=None)
    TEST = test.as_matrix(columns=None)
    print 'training'
    print training.dtypes
    arff.dump(ArffL + r'\trainingwu' + A + str(undersampler) + sentiment +
              '.arff', TRAINING, relation="whatever", names=headers_names)
    arff.dump(ArffL + r'\testwu' + A + str(undersampler) + sentiment +
              '.arff', TEST, relation="whatever", names=headers_names)
def main():
    op = argparse.ArgumentParser(add_help=False)
    o = op.add_argument
    o("-?", "--help", action="help")
    o("-i", "--input", help="input file")
    o("-c", "--cfile", help="c output file")
    o("-C", "--comments", help="keep comments", action="store_true")
    o("-h", "--hfile", help="h output file")
    o("-n", "--name", help="module name")
    o("-p", "--prefix", help="header guard prefix", default="")
    o("-N", "--no-lines", action="store_true",
      help="don't generate #line directives")
    o("-s", "--sfile", help="intermediate code output file")
    o("-j", "--join", nargs="+", help="files to join")
    o("-o", "--output", help="source code output file")
    o("-t", "--ctypes", help="ctypes output file")
    o("-T", "--terminal", action="store_true")
    o("-d", "--dfile", help="output api docs")
    o("-DT", "--debug-tokens")
    options = op.parse_args()

    p = None
    if options.terminal:
        terminal.run()
        exit(0)
    if options.join:
        if options.output:
            f = open(options.output, "wb")
        else:
            f = sys.stdout
        join.join(options.join, f)
    elif options.input:
        text = open(options.input, "rb").read().decode("utf8")
        p = Parser(options.input, text, comments=options.comments,
                   options=options)
        try:
            p.parse()
        except MyError as e:
            print(e.value.encode("ascii", errors="replace").decode("ascii"))
            exit(1)
        if not options.name:
            options.name = os.path.splitext(options.input)[0]
    else:
        print("No input file given with -i.")
        op.print_help()
        exit(1)

    if p and options.sfile:
        s = SWriter()
        code = s.generate(p)
        f = open(options.sfile, "wb")
        f.write(code.encode("utf8"))
    if p and options.dfile:
        d = dout.DWriter()
        code = d.generate(p, options.name)
        f = open(options.dfile, "wb")
        f.write(code.encode("utf8"))
    if p and (options.cfile or options.hfile):
        c = CWriter()
        try:
            code, header = c.generate(p, options.name, options.no_lines,
                                      options.prefix)
        except MyError as e:
            print(e)
            exit(1)
        if options.cfile:
            f = open(options.cfile, "wb")
            f.write(code.encode("utf8"))
        if options.hfile:
            f = open(options.hfile, "wb")
            f.write(header.encode("utf8"))
    if p and options.ctypes:
        w = CTypesWriter()
        code = w.generate(p)
        f = open(options.ctypes, "wb")
        f.write(code.encode("utf8"))
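# Hedged invocation examples (the script name is an assumption):
#   python main.py -i module.src -c module.c -h module.h   # compile a module
#   python main.py -j part1.src part2.src -o joined.src    # join input files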
def test_left_join(self):
    left = join(dogs, cats, 'left', 'name')
    assert len(left) == 4
def test_right_join(self):
    right = join(dogs, cats, 'right', 'name')
    assert len(right) == 6
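# A minimal sketch of the list join these tests assume: rows are objects,
# the third argument selects inner/left/right/outer, and the fourth names
# the attribute matched on; unmatched rows pair with None. This is an
# illustration under those assumptions, not the project's implementation.
def join(left, right, kind, key):
    pairs = []
    matched = set()
    for l in left:
        hits = [r for r in right if getattr(r, key) == getattr(l, key)]
        matched.update(id(r) for r in hits)
        if hits:
            pairs.extend((l, r) for r in hits)
        elif kind in ('left', 'outer'):
            pairs.append((l, None))  # keep unmatched left rows
    if kind in ('right', 'outer'):
        pairs.extend((None, r) for r in right if id(r) not in matched)
    return pairs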
print("retrieving nodes...\n") circles = get_circlecontours(nodes) rois, centers = get_maskedcontours(gray, circles) print("labelling...\n") labels = [] for roi in rois: x = retrieve_digits(roi) labels.append(x) vertices = dict(zip(range(1, len(rois) + 1), labels)) print("vertices : ", vertices) raw_adj = join(edges, centers, directed) adj_matrix = [] for x in raw_adj: adj_matrix.append( (int(vertices[x]), [int(vertices[y]) for y in raw_adj[x]])) print() print("Printing Adjacency List....\n") adj_matrix = dict(adj_matrix) print(adj_matrix) cv2.waitKey(0) cv2.destroyAllWindows()