Example #1
def main(argv):
    lexemes = []
    x = 1

    try:
        opts, args = getopt.getopt(argv, "h", ["help"])
    except getopt.GetoptError:
        usage()
        sys.exit(1)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()

    for arg in args:
        try:
            with open(arg, 'r') as f:
                #contents.append(f.read())
                readin = f.read()
                myparser.parser(readin)

        except IOError:
            print("File %s not found!" % arg)
            sys.exit(1)
Example #2
File: driver.py Project: stumped2/school
def main(argv):
  lexemes = []
  x = 1

  try:
    opts, args = getopt.getopt(argv, "h", ["help"])
  except getopt.GetoptError:
    usage()
    sys.exit(1)

  for opt, arg in opts:
    if opt in ("-h", "--help"):
      usage()
      sys.exit()

  
  for arg in args:
    try:
      with open(arg, 'r') as f:
        #contents.append(f.read())
        readin = f.read()
        myparser.parser(readin)

    except IOError:
      print("File %s not found!" % arg)
      sys.exit(1)
Example #3
def startFetch(inputurl='https://www.ccu.edu.tw/', inputLevel=7):
    currentLevel = inputLevel
    currentUrl = inputurl
    fetchData = fetcher(currentUrl)
    clock = 0
    print('------------------------------')
    print(fetchData.time)
    print(fetchData.status_code)
    print(fetchData.url)
    print(fetchData.ip)
    print(fetchData.content)
    print('------------------------------')
    fetchData.title, contextpool, links = ps.parser(fetchData.content)
    fetchData.content = "".join(contextpool)
    insertDB(currentUrl, fetchData)

    #SiteDbInsert(fetchData)
    #title, contextpool, links = ps.parser(fetchData.content)
    links.sort()
    loadQueue()
    for i in links:
        tmpUrl = UrlQueueFilter(i, currentUrl)
        if (tmpUrl == False):
            continue
        Urlqueue.put([tmpUrl, str(int(currentLevel) - 1)])
    saveQueue()
    loadQueue()

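    # process queued URLs breadth-first until the queue is empty, pushing newly found links back onto it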
    while (Urlqueue.empty() == False):
        clock = clock + 1
        item = Urlqueue.get()
        currentUrl = item[0]
        currentLevel = item[1]
        print("currentUrl=" + str(currentUrl))
        print("currentLevel=" + str(currentLevel))
        if (int(currentLevel) <= 0):  # levels come off the queue as strings
            continue
        urlIDcheck = siteDB.CheckDBUrl(currentUrl)
        if (urlIDcheck == 'NotInDB'):
            fetchData = fetcher(currentUrl)
            if (fetchData.status_code != 200):
                continue
            fetchData.title, contextpool, links = ps.parser(fetchData.content)
            fetchData.content = "".join(contextpool)
            insertDB(currentUrl, fetchData)
            links.sort()
            for i in links:
                tmpUrl = UrlQueueFilter(i, currentUrl)
                if (tmpUrl == False):
                    continue
                Urlqueue.put([tmpUrl, str(int(currentLevel) - 1)])
            if (clock % 500 == 0):
                print('----------SaveData--------')
                saveQueue()
                loadQueue()
        else:
            fetchData = siteDB.getIDdata(urlIDcheck)
            insertDB(currentUrl, fetchData)
Example #4
    def get_results(self):
        raw_results = myparser.parser(self.total_results, self.word)
        results = search_results()

        results.emails = raw_results.emails()
        results.hostnames = raw_results.hostnames()
        return results
Example #5
	def get_results(self):
		raw_results=myparser.parser(self.total_results,self.word)
		results = search_results()

		results.emails = raw_results.emails()
		results.hostnames = raw_results.hostnames()
		return results
Example #6
def assembler(filename):
    '''Open a file with a .asm extension, strip whitespace and
    comments, then do a first pass to handle symbols (labels and
    variables). In the second pass, translate into binary code.'''

    try:
        if not filename.endswith('.asm'):
            print("Cannot process, input file format is not supported")
            return -1

        # load assembler code
        with open(filename) as f:
            data = f.readlines()

        binary_code = []  # container for the translated binary code
        line_no = 0  # start line counter
        parser = myparser.parser()

        ###### First pass ######
        #create and populate symbol table with predefined symbols
        symbol_df = pd.DataFrame(pd.read_csv('symbols.txt',
                                             sep=',',
                                             encoding='utf-16',
                                             dtype='string'),
                                 columns=['label', 'value'])
        symbol_table = dict(zip(symbol_df.label, symbol_df.value))

        preprocessed_data = []
        for line in data:
            line = line.strip()
            #remove comments
            if '//' in line:
                line = line[:line.find('//')]

            if line:
                instruction_type, components = parser.parse(line)
                #if it's label then update symbol table
                if instruction_type == 2:
                    if not components['label'] in symbol_table:
                        symbol_table[components['label']] = str(line_no)

                #this is an instruction so add to preprocessed code
                else:
                    preprocessed_data.append(line)
                    line_no += 1

        ###### Second pass ######
        translator = mytranslator.translator(symbol_table)
        for line in preprocessed_data:
            instruction_type, components = parser.parse(line)
            binary_code.append(
                translator.get_binary(instruction_type, components))
        print(symbol_table)
    except Exception as e:
        print(e)
        return

    return binary_code
Example #7
 def get_people(self):
     rawres = myparser.parser(self.totalresults, self.word)
     return rawres.people_googleplus()
Example #8
def get_profiles(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.profiles()
Example #9
	def getHosts(self,domain):
		em=myparser.parser(self.text,domain)
		return em.hostnames()
Example #10
 def get_people(self):
     rawres = myparser.parser(self.totalresults, self.word)
     return rawres.people_linkedin()
Example #11
def get_hostnames(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.hostnames()
Example #12
 def get_set(self):
     rawres = myparser.parser(self.totalresults, list)
     return rawres.set()
Example #13
 def get_people(self):
     rawres = myparser.parser(self.totalresults, self.word)
     return rawres.people_googleplus()
Example #14
import lexicalAnalyzer
import sys
import myparser

if __name__ == "__main__" :
    #argv like "./test/test6.c"
    file_name_full = sys.argv[1]
    file_name = ""
    if file_name_full.find(".",2) != -1:
        file_name = file_name_full[0 : file_name_full.find(".",2)]
    else :
        file_name = file_name_full
        file_name_full += ".c"

    # do lexer
    lexicalAnalyzer.main(file_name_full)
    # check correctness
    result = myparser.parser(file_name + '.o')
    if result:
        print("Code is correct")
Example #15
def evaluate(expression=""):
    value = myparser.parser(expression)
    if value == None:
        value = 'None'
    return {'expression': expression, 'value': value}
Example #16
def getEmails(text):
    em = myparser.parser(text)
    print(em.emails())
Example #17
	def get_results(self):
		raw_results=myparser.parser(self.total_results,self.word)
		results = search_results()
		results.people = raw_results.people_123people()
		return results
Example #18
def getURLS(text):
    em = myparser.parser(text)
    print(em.fileurls())
Example #19
def getPeople_linkedin(text):
    em = myparser.parser(text)
    print(em.people_linkedin())
Example #20
def getHostnames(text):
    em = myparser.parser(text)
    print(em.hostnames())
Example #21
 def get_set(self):
     rawres = myparser.parser(self.totalresults, list)
     return rawres.set()
Example #22
 def test_emails(self):
   word = 'domain.com'
   results = '***a@domain***banotherdomain.com***[email protected]***[email protected]***'
   p = myparser.parser(results, word)
   emails = sorted(p.emails())
   self.assertEquals(emails, [ '*****@*****.**', '*****@*****.**' ])
Example #23
def getEmails(self):
    res = myparser.parser(self.text)
    return res.emails()
Example #24
 def getEmails(self):
     em = myparser.parser(self.text)
     return em.emails()
Example #25
 def get_urls(self):
     try:
         urls = myparser.parser(self.totalresults, "trello.com")
         return urls.urls()
     except Exception as e:
         print("Error occurred: " + str(e))
Example #26
 def getHosts(self, domain):
     em = myparser.parser(self.text, domain)
     return em.hostnames()
Example #27
 def get_people(self):
     rawres=myparser.parser(self.totalresults,self.word)
     return rawres.people_linkedin()
Example #28
 def get_files(self):
     rawres = myparser.parser(self.totalresults, self.word)
     return rawres.fileurls(self.files)
Example #29
def get_emails(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.emails()
Example #30
 def get_results(self):
     raw_results = myparser.parser(self.total_results, self.word)
     results = search_results()
     results.people = raw_results.people_linkedin()
     return results
Example #31
def get_files(self):
    rawres = myparser.parser(self.totalresults, self.word)
    return rawres.fileurls(self.files)
Example #32
 def get_emails(self):
     rawres = myparser.parser(self.total_results, self.word)
     return rawres.emails()
Example #33
	def getEmails(self):
		em=myparser.parser(self.text)
		return em.emails()
Example #34
def startFetch(inputurl='https://www.ccu.edu.tw/',
               inputLevel=6,
               inputspeed=0.05,
               inputthread=1):
    global SeenDB
    SeenDB = DBCtrl.loadSeenDB(inputthread)
    logging.basicConfig(level=logging.ERROR,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M',
                        handlers=[
                            logging.FileHandler(
                                'error_' + str(inputthread) + '.log', 'w',
                                'utf-8'),
                        ])

    logging.debug('Hello debug!')
    logging.info('Hello info!')
    logging.warning('Hello warning!')
    logging.error('Hello error!')
    logging.critical('Hello critical!')

    currentBatch = currentBatchInit(inputurl, inputspeed, inputthread)
    startTime = time.time()
    saveTime = 0
    queueSaveTime = 0
    passTime = currentBatch.passedTime
    filterArr = DBCtrl.filterArrGet()
    loadQueue(inputthread)  # load the saved URL queue
    if (Urlqueue.empty() == True):  # if the queue is empty, start from the seed URL
        Urlqueue.put([inputurl, 0, inputurl])
    fetchCnt = currentBatch.totalFetchCnt
    while (Urlqueue.empty() == False):  # crawl until the queue is empty
        try:
            item = Urlqueue.get()
            currentUrl = item[0]
            currentLevel = item[1]
            parentUrl = item[2]
            print("currentUrl=" + str(currentUrl))
            print("currentLevel=" + str(currentLevel))
            if (int(currentLevel) >= int(inputLevel)):  # skip anything beyond the configured depth
                continue
            if isBan(currentUrl):
                continue
            urlIDcheck = siteDB.CheckDBUrl(currentUrl)  # check whether the URL is already in the database
            fetchCnt = fetchCnt + 1
            if (urlIDcheck == 'NotInDB'):  # not in the database, so fetch it
                print('NotInDB')
                fetchData = fetcher(
                    currentUrl, currentBatch.speed
                )  # run the fetcher; it returns the fetchData structure defined in DataStruct.py
                if (fetchData.status_code != 200):  # check that the HTTP status code is OK
                    currentBatch.failCnt = currentBatch.failCnt + 1
                    saveFailURL([currentUrl, parentUrl, fetchData.status_code],
                                inputthread)
                    continue
                currentBatch.successCnt = currentBatch.successCnt + 1
                fetchData.title, contextpool, links = ps.parser(
                    fetchData.content)  # feed the fetched page to the parser to get the title, body text and links
                fetchData.content = ",".join(contextpool)  # join the body-text array into a single string
                siteDB.insertDataToDB(fetchData)

                links.sort()
                #print(links)
                for i in links:  # push the extracted links onto the queue
                    tmpUrl = UrlQueueFilter(i, currentUrl, filterArr,
                                            inputthread)
                    if (tmpUrl == False):
                        continue
                    Urlqueue.put(
                        [tmpUrl,
                         str(int(currentLevel) + 1), currentUrl])

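                # record the host IP: insert a new IP entry or update the existing one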
                IPData = DataStruct.IPData(ip=fetchData.ip,
                                           url=currentUrl,
                                           fetchCount=1,
                                           isban=0,
                                           speed=currentBatch.speed,
                                           parentUrl=parentUrl,
                                           beConnectedCount=1)
                ipID = ipDB.CheckIPinDB(IPData.ip)
                if (ipID == 'NotInIPDB'):
                    ipDB.insertDataToDB(IPData)
                    while (1):
                        ipID = ipDB.CheckIPinDB(IPData.ip)
                        if (ipID == 'NotInIPDB'):
                            time.sleep(0.5)
                        else:
                            break
                else:
                    ipDB.updateDB(ipID, IPData)
                    updateParentURL(IPData)
            else:  # the URL is already in the database: update its fetch count (how many sites link to it); the stored content is not refreshed here, that would need a separate update routine
                print('indb')
                siteDB.updatefetchCountDB(urlIDcheck)
                currentBatch.redundancyUrlCnt = currentBatch.redundancyUrlCnt + 1  # count duplicate URLs

            timeLag = time.time() - startTime
            saveTime = saveTime + timeLag
            queueSaveTime = queueSaveTime + timeLag
            passTime = passTime + timeLag
            startTime = time.time()
            if (saveTime >= 4):  # refresh the statistics shown on the web page
                saveTime = 0
                currentBatchData = [
                    fetchData.ip, fetchData.url, currentLevel, fetchCnt,
                    Urlqueue.qsize(), currentBatch.failCnt,
                    currentBatch.successCnt, currentBatch.redundancyUrlCnt,
                    currentBatch.speed, passTime
                ]  # IP, current URL, current depth, pages fetched, queue size, failures, successes, duplicate URLs, speed, elapsed time
                DBCtrl.currentBatchInsert(currentBatchData, inputthread)
                filterArr = DBCtrl.filterArrGet()

                with open('mutual_state.csv', newline='') as csvfile:
                    reader = csv.reader(csvfile)
                    rows = [row for row in reader]
                    if (rows[inputthread][1] == "delete"):
                        print('----------SaveData and quit--------')
                        saveQueue(inputthread)
                        return 0

            if (queueSaveTime >= 1800):  # every 30 minutes, save and reload the queue
                queueSaveTime = 0
                print('----------30min--SaveData--------')
                saveQueue(inputthread)
                loadQueue(inputthread)

            if (fetchCnt % 10000 == 0):  # periodic autosave
                print('-------10000---SaveData--------')
                saveQueue(inputthread)
                loadQueue(inputthread)

        except Exception as e:
            print(e)
            print('--------Exception--SaveData--------')
            saveQueue(inputthread)
            loadQueue(inputthread)

    print('----------finish--------')
    saveQueue(inputthread)
    Mu_path = os.path.join(os.path.dirname(__file__), 'mutual_state.csv')
    with open(Mu_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        rows = [row for row in reader]
        rows[inputthread][1] = "delete"
        writer = csv.writer(open(Mu_path, 'w', newline=''))
        writer.writerows(rows)
    return 0
Example #35
 def get_people(self):
     rawres = myparser.parser(self.totalresults, self.word)
     return rawres.people_jigsaw()
Example #36
from myparser import parser
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

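# line filters and text-to-number replacements for the two catalyst series read from c.txt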
filtr1 = (lambda line: '(АРД)' in line)
rep1 = [(r'%Pd/C(АРД)', ''), ('---', '0'), ('alkaly', '2'), ('acid', '1')]

filtr2 = (lambda line: 'Al2O3' in line)
rep2 = [(r'%Pd/2%C/Al2O3', '')] + rep1[1:]

res1 = parser(
    "c.txt",
    (float, float, float, float, float),
    filt=filtr1,
    replacement=rep1,
)

res2 = parser(
    "c.txt",
    (float, float, float, float, float),
    filt=filtr2,
    replacement=rep2,
)

cat1, nature1, s, order, speed1 = res1
cat2, nature2, s, order, speed2 = res2
fig = plt.gcf()
ax = fig.gca(projection='3d')
surf = ax.scatter(cat1, nature1, speed1, c='b')
surf1 = ax.scatter(cat2, nature2, speed2, c='r')
Example #37
 def get_profiles(self):
     rawres = myparser.parser(self.totalresults, self.word)
     return rawres.profiles()
Example #38
 def get_emails(self):
     rawres = myparser.parser(self.total_results, self.word)
     self.print_good("%s email(s) found in %s" %
                     (len(rawres.emails()), self.engine))
     #print "%s email(s) found in %s" % (len(rawres.emails()),self.engine)
     return rawres.emails()
Example #39
 def get_people(self):
     rawres=myparser.parser(self.totalresults,self.word)
     return rawres.people_jigsaw()
Example #40
 def get_hostnames(self):
     rawres = myparser.parser(self.total_results, self.word)
     self.print_good("%s domain(s) found in %s" %
                     (len(rawres.hostnames()), self.engine))
     #print "%s domain(s) found in %s" %(len(rawres.hostnames()),self.engine)
     return rawres.hostnames()
Example #41
 def get_hostnames(self):
     rawres = myparser.parser(self.total_results, self.word)
     return rawres.hostnames()
Example #42
	def getEmails(self):
		res=myparser.parser(self.text)
		return res.emails()