def main(argv):
    lexemes = []
    x = 1
    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
    for arg in args:
        try:
            with open(arg, 'r') as f:
                #contents.append(f.read())
                readin = f.read()
                myparser.parser(readin)
        except IOError:
            print("File %s not found!" % arg)
            sys.exit(1)

def startFetch(inputurl='https://www.ccu.edu.tw/', inputLevel=7):
    currentLevel = inputLevel
    currentUrl = inputurl
    fetchData = fetcher(currentUrl)
    clock = 0
    print('------------------------------')
    print(fetchData.time)
    print(fetchData.status_code)
    print(fetchData.url)
    print(fetchData.ip)
    print(fetchData.content)
    print('------------------------------')
    fetchData.title, contextpool, links = ps.parser(fetchData.content)
    fetchData.content = "".join(contextpool)
    insertDB(currentUrl, fetchData)
    #SiteDbInsert(fetchData)
    #title, contextpool, links = ps.parser(fetchData.content)
    links.sort()
    loadQueue()
    for i in links:
        tmpUrl = UrlQueueFilter(i, currentUrl)
        if tmpUrl == False:
            continue
        Urlqueue.put([tmpUrl, str(int(currentLevel) - 1)])
    saveQueue()
    loadQueue()
    while not Urlqueue.empty():
        clock = clock + 1
        item = Urlqueue.get()
        currentUrl = item[0]
        currentLevel = item[1]
        print("currentUrl=" + str(currentUrl))
        print("currentLevel=" + str(currentLevel))
        if int(currentLevel) == 0:  # queue stores the level as a string
            continue
        urlIDcheck = siteDB.CheckDBUrl(currentUrl)
        if urlIDcheck == 'NotInDB':
            fetchData = fetcher(currentUrl)
            if fetchData.status_code != 200:
                continue
            fetchData.title, contextpool, links = ps.parser(fetchData.content)
            fetchData.content = "".join(contextpool)
            insertDB(currentUrl, fetchData)
            links.sort()
            for i in links:
                tmpUrl = UrlQueueFilter(i, currentUrl)
                if tmpUrl == False:
                    continue
                Urlqueue.put([tmpUrl, str(int(currentLevel) - 1)])
            if clock % 500 == 0:
                print('----------SaveData--------')
                saveQueue()
                loadQueue()
        else:
            fetchData = siteDB.getIDdata(urlIDcheck)
            insertDB(currentUrl, fetchData)

def get_results(self):
    raw_results = myparser.parser(self.total_results, self.word)
    results = search_results()
    results.emails = raw_results.emails()
    results.hostnames = raw_results.hostnames()
    return results

def get_results(self):
    raw_results = myparser.parser(self.total_results, self.word)
    results = search_results()
    results.emails = raw_results.emails()
    results.hostnames = raw_results.hostnames()
    return results

def assembler(filename):
    '''Open a file with the .asm extension, strip whitespace and comments,
    do a first pass to resolve symbols (labels and variables), then
    translate the instructions into binary code in a second pass.'''
    try:
        if not filename.endswith('.asm'):
            print("Cannot process, input file format is not supported")
            return -1
        # load assembler code
        with open(filename) as f:
            data = f.readlines()
        binary_code = []  # container for the translated binary code
        line_no = 0  # instruction counter (labels are not counted)
        parser = myparser.parser()
        ###### First pass ######
        # create and populate the symbol table with predefined symbols
        symbol_df = pd.DataFrame(pd.read_csv('symbols.txt', sep=',',
                                             encoding='utf-16', dtype='string'),
                                 columns=['label', 'value'])
        symbol_table = dict(zip(symbol_df.label, symbol_df.value))
        preprocessed_data = []
        for line in data:
            line = line.strip()
            # remove comments
            if '//' in line:
                line = line[:line.find('//')]
            if line:
                instruction_type, components = parser.parse(line)
                # if it's a label, update the symbol table
                if instruction_type == 2:
                    if components['label'] not in symbol_table:
                        symbol_table[components['label']] = str(line_no)
                # otherwise it's an instruction, so add it to the preprocessed code
                else:
                    preprocessed_data.append(line)
                    line_no += 1
        ###### Second pass ######
        translator = mytranslator.translator(symbol_table)
        for line in preprocessed_data:
            instruction_type, components = parser.parse(line)
            binary_code.append(translator.get_binary(instruction_type, components))
        print(symbol_table)
    except Exception as e:
        print(e)
        return
    return binary_code

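# A minimal usage sketch for assembler() above. It assumes binary_code is a
# list of binary instruction strings and that the output goes to a '.hack'
# file next to the input; the entry point and output name are illustrative,
# not part of the original code.
if __name__ == '__main__':
    import sys
    binary = assembler(sys.argv[1])  # e.g. python assemble.py Add.asm
    if binary and binary != -1:
        out_name = sys.argv[1].rsplit('.asm', 1)[0] + '.hack'  # hypothetical output path
        with open(out_name, 'w') as out:
            out.write('\n'.join(binary) + '\n')
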
def get_people(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.people_googleplus()

def get_profiles(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.profiles()

def getHosts(self, domain):
    em = myparser.parser(self.text, domain)
    return em.hostnames()

def get_people(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.people_linkedin()

def get_hostnames(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.hostnames()

def get_set(self):
    rawres = myparser.parser(self.totalresults, list)
    return rawres.set()

import lexicalAnalyzer
import sys
import myparser

if __name__ == "__main__":
    # argv like "./test/test6.c"
    file_name_full = sys.argv[1]
    file_name = ""
    if file_name_full.find(".", 2) != -1:
        file_name = file_name_full[0:file_name_full.find(".", 2)]
    else:
        file_name = file_name_full
        file_name_full += ".c"
    # do lexer
    lexicalAnalyzer.main(file_name_full)
    # check correctness
    result = myparser.parser(file_name + '.o')
    if result:
        print("Code is correct")

def evaluate(expression=""): value = myparser.parser(expression) if value == None: value = 'None' return {'expression': expression, 'value': value}
def getEmails(text):
    em = myparser.parser(text)
    print(em.emails())

def get_results(self):
    raw_results = myparser.parser(self.total_results, self.word)
    results = search_results()
    results.people = raw_results.people_123people()
    return results

def getURLS(text):
    em = myparser.parser(text)
    print(em.fileurls())

def getPeople_linkedin(text):
    em = myparser.parser(text)
    print(em.people_linkedin())

def getHostnames(text):
    em = myparser.parser(text)
    print(em.hostnames())

def test_emails(self):
    word = 'domain.com'
    results = '***a@domain***banotherdomain.com***[email protected]***[email protected]***'
    p = myparser.parser(results, word)
    emails = sorted(p.emails())
    self.assertEqual(emails, ['*****@*****.**', '*****@*****.**'])

def getEmails(self):
    res = myparser.parser(self.text)
    return res.emails()

def getEmails(self):
    em = myparser.parser(self.text)
    return em.emails()

def get_urls(self):
    try:
        urls = myparser.parser(self.totalresults, "trello.com")
        return urls.urls()
    except Exception as e:
        print("Error occurred: " + str(e))

def getHosts(self, domain):
    em = myparser.parser(self.text, domain)
    return em.hostnames()

def get_people(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.people_linkedin()

def get_files(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.fileurls(self.files)

def get_emails(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.emails()

def get_results(self):
    raw_results = myparser.parser(self.total_results, self.word)
    results = search_results()
    results.people = raw_results.people_linkedin()
    return results

def get_files(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.fileurls(self.files)

def get_emails(self):
    rawres = myparser.parser(self.total_results, self.word)
    return rawres.emails()

def getEmails(self):
    em = myparser.parser(self.text)
    return em.emails()

def startFetch(inputurl='https://www.ccu.edu.tw/', inputLevel=6, inputspeed=0.05, inputthread=1):
    global SeenDB
    SeenDB = DBCtrl.loadSeenDB(inputthread)
    logging.basicConfig(level=logging.ERROR,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M',
                        handlers=[
                            logging.FileHandler('error_' + str(inputthread) + '.log', 'w', 'utf-8'),
                        ])
    logging.debug('Hello debug!')
    logging.info('Hello info!')
    logging.warning('Hello warning!')
    logging.error('Hello error!')
    logging.critical('Hello critical!')
    currentBatch = currentBatchInit(inputurl, inputspeed, inputthread)
    startTime = time.time()
    saveTime = 0
    queueSaveTime = 0
    passTime = currentBatch.passedTime
    filterArr = DBCtrl.filterArrGet()
    loadQueue(inputthread)  # load the queue
    if Urlqueue.empty():  # if the queue is empty, start from the seed site
        Urlqueue.put([inputurl, 0, inputurl])
    fetchCnt = currentBatch.totalFetchCnt
    while not Urlqueue.empty():  # crawl until the queue is empty
        try:
            item = Urlqueue.get()
            currentUrl = item[0]
            currentLevel = item[1]
            parentUrl = item[2]
            print("currentUrl=" + str(currentUrl))
            print("currentLevel=" + str(currentLevel))
            if int(currentLevel) >= int(inputLevel):  # skip anything beyond the configured depth
                continue
            if isBan(currentUrl):
                continue
            urlIDcheck = siteDB.CheckDBUrl(currentUrl)  # check whether the URL is already in the database
            fetchCnt = fetchCnt + 1
            if urlIDcheck == 'NotInDB':  # only fetch URLs not yet in the database
                print('NotInDB')
                fetchData = fetcher(currentUrl, currentBatch.speed)  # run the fetcher; the fetchData type is defined in DataStruct.py
                if fetchData.status_code != 200:  # check that the status code is OK
                    currentBatch.failCnt = currentBatch.failCnt + 1
                    saveFailURL([currentUrl, parentUrl, fetchData.status_code], inputthread)
                    continue
                currentBatch.successCnt = currentBatch.successCnt + 1
                fetchData.title, contextpool, links = ps.parser(fetchData.content)  # feed the fetched data to the parser to get the title, links and body text
                fetchData.content = ",".join(contextpool)  # join the body text array into a single string
                siteDB.insertDataToDB(fetchData)
                links.sort()
                #print(links)
                for i in links:  # add the links to the queue
                    tmpUrl = UrlQueueFilter(i, currentUrl, filterArr, inputthread)
                    if tmpUrl == False:
                        continue
                    Urlqueue.put([tmpUrl, str(int(currentLevel) + 1), currentUrl])
                IPData = DataStruct.IPData(ip=fetchData.ip, url=currentUrl, fetchCount=1,
                                           isban=0, speed=currentBatch.speed,
                                           parentUrl=parentUrl, beConnectedCount=1)
                ipID = ipDB.CheckIPinDB(IPData.ip)
                if ipID == 'NotInIPDB':
                    ipDB.insertDataToDB(IPData)
                    while True:
                        ipID = ipDB.CheckIPinDB(IPData.ip)
                        if ipID == 'NotInIPDB':
                            time.sleep(0.5)
                        else:
                            break
                else:
                    ipDB.updateDB(ipID, IPData)
                    updateParentURL(IPData)
            else:
                # The URL is already in the database, so just update its fetch count
                # (how many sites link to it). The stored content is not refreshed here;
                # updating existing records would need a separate routine.
                print('indb')
                siteDB.updatefetchCountDB(urlIDcheck)
                currentBatch.redundancyUrlCnt = currentBatch.redundancyUrlCnt + 1  # count duplicate URLs
            timeLag = time.time() - startTime
            saveTime = saveTime + timeLag
            queueSaveTime = queueSaveTime + timeLag
            passTime = passTime + timeLag
            startTime = time.time()
            if saveTime >= 4:  # refresh the data shown on the web page
                saveTime = 0
                currentBatchData = [fetchData.ip, fetchData.url, currentLevel, fetchCnt,
                                    Urlqueue.qsize(), currentBatch.failCnt,
                                    currentBatch.successCnt, currentBatch.redundancyUrlCnt,
                                    currentBatch.speed, passTime]
                # IP, current URL, current depth, fetched, remaining queue, failures, successes, duplicate URLs, speed, elapsed time
                DBCtrl.currentBatchInsert(currentBatchData, inputthread)
                filterArr = DBCtrl.filterArrGet()
                with open('mutual_state.csv', newline='') as csvfile:
                    reader = csv.reader(csvfile)
                    rows = [row for row in reader]
                if rows[inputthread][1] == "delete":
                    print('----------SaveData and quit--------')
                    saveQueue(inputthread)
                    return 0
            if queueSaveTime >= 1800:  # save the queue every 30 minutes
                queueSaveTime = 0
                print('----------30min--SaveData--------')
                saveQueue(inputthread)
                loadQueue(inputthread)
            if fetchCnt % 10000 == 0:  # periodic auto-save
                print('-------10000---SaveData--------')
                saveQueue(inputthread)
                loadQueue(inputthread)
        except Exception as e:
            print(e)
            print('--------Exception--SaveData--------')
            saveQueue(inputthread)
            loadQueue(inputthread)
    print('----------finish--------')
    saveQueue(inputthread)
    Mu_path = os.path.join(os.path.dirname(__file__), 'mutual_state.csv')
    with open(Mu_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        rows = [row for row in reader]
    rows[inputthread][1] = "delete"
    writer = csv.writer(open(Mu_path, 'w', newline=''))
    writer.writerows(rows)
    return 0

def get_people(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.people_jigsaw()

from myparser import parser
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

filtr1 = (lambda line: '(АРД)' in line)
rep1 = [(r'%Pd/C(АРД)', ''), ('---', '0'), ('alkaly', '2'), ('acid', '1')]
filtr2 = (lambda line: 'Al2O3' in line)
rep2 = [(r'%Pd/2%C/Al2O3', '')] + rep1[1:]

res1 = parser(
    "c.txt",
    (float, float, float, float, float),
    filt=filtr1,
    replacement=rep1,
)
res2 = parser(
    "c.txt",
    (float, float, float, float, float),
    filt=filtr2,
    replacement=rep2,
)

cat1, nature1, s, order, speed1 = res1
cat2, nature2, s, order, speed2 = res2

fig = plt.gcf()
ax = fig.gca(projection='3d')
surf = ax.scatter(cat1, nature1, speed1, c='b')
surf1 = ax.scatter(cat2, nature2, speed2, c='r')

def get_profiles(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.profiles()

def get_emails(self):
    rawres = myparser.parser(self.total_results, self.word)
    self.print_good("%s email(s) found in %s" % (len(rawres.emails()), self.engine))
    #print "%s email(s) found in %s" % (len(rawres.emails()),self.engine)
    return rawres.emails()

def get_people(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.people_jigsaw()

def get_hostnames(self):
    rawres = myparser.parser(self.total_results, self.word)
    self.print_good("%s domain(s) found in %s" % (len(rawres.hostnames()), self.engine))
    #print "%s domain(s) found in %s" %(len(rawres.hostnames()),self.engine)
    return rawres.hostnames()

def get_hostnames(self):
    rawres = myparser.parser(self.total_results, self.word)
    return rawres.hostnames()

def getEmails(self):
    res = myparser.parser(self.text)
    return res.emails()