예제 #1
0
    def rank(self):
        """Write the leading share of dictionary-backed sample words to disk.

        Reads the tab-separated dictionary at ``self.dict_dir`` (first column
        is the word) and the ``word:count`` lines at ``self.res_dir``. Sample
        lines whose word appears in the dictionary are kept in file order; the
        first ``round(len(kept) * self.ratio)`` of them are written, one word
        per line, to ``self.rank_dir`` and their integer counts are stored in
        ``self.dic_word``.
        """
        dict_lines = rw.readFile(self.dict_dir).split("\n")[0:-1]
        sample_lines = rw.readFile(self.res_dir).split("\n")[0:-1]

        # Only the first tab-separated column of the dictionary matters here.
        known_words = set(entry.split("\t")[0] for entry in dict_lines)
        matched = [entry for entry in sample_lines
                   if entry.split(":")[0] in known_words]

        # presumably the sample file is ordered by frequency, so a prefix is
        # the "top" share -- verify against whatever produces self.res_dir
        keep = int(round(len(matched) * self.ratio))
        kept_words = []
        for entry in matched[:keep]:
            fields = entry.split(':')
            kept_words.append(fields[0])
            self.dic_word[fields[0]] = int(fields[1])

        rw.writeFile(self.rank_dir, "".join(word + '\n' for word in kept_words))
예제 #2
0
 def readDictionary(self):
     """Load every not-yet-loaded ``*.txt`` file of ``self.dic_dir`` into ``self.dic``.

     The key is the file name without its ``.txt`` suffix; the value is the
     file content split on CRLF with the final element dropped. Keys already
     present in ``self.dic`` are left untouched.
     """
     for filename in os.listdir(self.dic_dir):
         if not filename.endswith('.txt'):
             continue
         key = filename[0:-4]
         if key in self.dic:
             continue
         entries = rw.readFile(self.dic_dir + '/' + filename).split('\r\n')
         self.dic[key] = entries[0:-1]
예제 #3
0
 def get_text(self):
     """Return the content of ``total.txt`` in ``self.src_dir``, or "" if absent."""
     content = ""
     for name in os.listdir(self.src_dir):
         if name != 'total.txt':
             continue
         content += rw.readFile(self.src_dir + '/' + name)
     return content
예제 #4
0
 def addToDic(self, filename):
     """Accumulate the per-query counts of one file into ``self.query_dic``.

     Each line of ``self.src_dir/filename`` is read as ``query<TAB>count``;
     the integer count is added to any count already stored for that query.
     """
     src_path = self.src_dir + '/' + filename
     for record in rw.readFile(src_path).split('\n')[0:-1]:
         fields = record.split('\t')
         query = fields[0]
         self.query_dic[query] = self.query_dic.get(query, 0) + int(fields[1])
예제 #5
0
 def get_text(self):
     """Collect the text of ``total.txt`` from ``self.src_dir``.

     Returns the empty string when the directory holds no ``total.txt``.
     """
     collected = ""
     for entry in os.listdir(self.src_dir):
         if entry == 'total.txt':
             collected += rw.readFile(self.src_dir + '/' + entry)
     return collected
예제 #6
0
 def find_NE(self, filename):
     """Extract the lines of a source file that contain a bracketed span.

     The text of ``self.src_dir/filename`` is passed through
     ``self.find_key`` once per key of ``self.dic`` (presumably this marks
     matches with ``[...]`` -- confirm against ``find_key``). Every line
     that then contains ``[`` ... ``]`` with at least one character between
     them is written, newline-terminated, to ``self.res_dir/filename``.

     Returns the text that was written.
     """
     src_path = self.src_dir + '/' + filename
     res_path = self.res_dir + '/' + filename
     txt = rw.readFile(src_path)
     for key in self.dic:
         txt = self.find_key(key, txt)
     # Raw string: same pattern as before, without relying on the invalid
     # "\[" / "\]" escapes that newer interpreters warn about.
     bracketed = re.compile(r'.*\[.+\].*')
     content = "".join(line + '\n' for line in bracketed.findall(txt))
     rw.writeFile(res_path, content)
     return content
예제 #7
0
    def analyze(self):
        """Count word occurrences over every regular file in ``self.src_dir``.

        Each file's text is cut into segments with ``self.ch.findall``; the
        words returned by ``self.get_words`` for each segment are tallied in
        ``self.dictionary``, which is also returned. Sub-directories are
        skipped.
        """
        for filename in os.listdir(self.src_dir):
            path = self.src_dir + '/' + filename
            if os.path.isdir(path):
                continue
            for seg in self.ch.findall(rw.readFile(path)):
                for word in self.get_words(seg):
                    self.dictionary[word] = self.dictionary.get(word, 0) + 1
        return self.dictionary
예제 #8
0
 def sub(self, filename):
     """Run substitution over the leading queries of one log file.

     The file ``self.src_dir/filename`` is pre-processed with
     ``pre.sort_txt(text, [0, 1], 20)`` and its first ``self.num`` lines are
     read as ``query<TAB>frequency``. For each query whose ``self.run``
     result is non-empty, ``query<TAB>result<TAB>frequency`` is written to
     ``self.res_dir/filename``.
     """
     print("********************")
     print("Substituting File: %s" % filename)
     src_path = self.src_dir + '/' + filename
     res_path = self.res_dir + '/' + filename
     sorted_txt = pre.sort_txt(rw.readFile(src_path), [0, 1], 20)
     rows = []
     for record in sorted_txt.split('\n')[0:self.num]:
         fields = record.split('\t')
         replacement = self.run(fields[0])
         if replacement == "":
             continue
         print("Substituting: %s" % fields[0])
         rows.append(fields[0] + '\t' + replacement + '\t' + fields[1] + '\n')
     rw.writeFile(res_path, "".join(rows))
예제 #9
0
 def sub(self,filename):
     """Substitute the first ``self.num`` queries of a log file.

     Reads ``self.src_dir/filename``, filters it through
     ``pre.sort_txt(text, [0, 1], 20)``, and feeds the first column of each
     of the leading ``self.num`` lines to ``self.run``. Queries with a
     non-empty result are written to ``self.res_dir/filename`` as
     ``query<TAB>result<TAB>second column``.
     """
     print("********************")
     print("Substituting File: %s" % filename)
     src_path = self.src_dir + '/' + filename
     res_path = self.res_dir + '/' + filename
     prepared = pre.sort_txt(rw.readFile(src_path), [0, 1], 20)
     out_lines = []
     for entry in prepared.split('\n')[0:self.num]:
         cols = entry.split('\t')
         result = self.run(cols[0])
         if result != "":
             print("Substituting: %s" % cols[0])
             out_lines.append(cols[0] + '\t' + result + '\t' + cols[1] + '\n')
     rw.writeFile(res_path, "".join(out_lines))
예제 #10
0
 def segment(self,filename):
     """Segment the first ``self.num`` queries of a log file.

     Each of the leading ``self.num`` lines of ``self.src_dir/filename`` is
     read as ``query<TAB>frequency``; the query is passed to ``self.run``
     and, when the result is non-empty,
     ``query<TAB>result<TAB>frequency`` is written to
     ``self.res_dir/filename``.
     """
     print("********************")
     print("Segmenting File: %s" % filename)
     src_path = self.src_dir + '/' + filename
     res_path = self.res_dir + '/' + filename
     rows = []
     for record in rw.readFile(src_path).split('\n')[0:self.num]:
         fields = record.split('\t')
         pieces = self.run(fields[0])
         if pieces == "":
             continue
         print("Segmenting: %s" % fields[0])
         rows.append(fields[0] + '\t' + pieces + '\t' + fields[1] + '\n')
     rw.writeFile(res_path, "".join(rows))
예제 #11
0
    def sort_file(self, filename):
        """Filter one query log down to its long queries and report statistics.

        Lines of ``self.src_dir/filename`` are read as
        ``query<TAB>...<TAB>frequency``. Queries whose text is at least
        ``self.min_char`` long are written as ``query<TAB>frequency`` to
        ``self.tar_dir/filename``. The running totals on ``self``
        (``total_num``, ``total_num_sorted``, ``total_freq``,
        ``total_freq_sorted``) and the per-target counters
        (``target_num_l``, ``target_freq_l``) are updated along the way.

        Blank lines are not counted as queries. Lines without a frequency
        column are counted as queries but contribute no frequency and are
        never written out.

        Returns a multi-line, human-readable summary for this file; the two
        ratios are reported as 0.000 when their denominator is zero.
        """
        src_path = self.src_dir + "/" + filename
        tar_path = self.tar_dir + "/" + filename

        # split("\n") yields an empty trailing element for newline-terminated
        # files; dropping blanks keeps it from being counted as a query.
        query_list = [q for q in rw.readFile(src_path).split("\n") if q]

        num = len(query_list)
        freq = 0
        freq_sorted = 0
        query_list_sorted = []

        for query in query_list:
            temp = query.split("\t")
            if len(temp) < 2:
                # No frequency column: the last field is the query text
                # itself and must not be parsed as a count.
                continue
            query_freq = int(temp[-1])
            freq += query_freq
            if len(temp[0]) < self.min_char:
                continue
            query_list_sorted.append(temp[0] + "\t" + temp[-1])
            freq_sorted += query_freq
            # enumerate keeps each counter aligned with its own target even
            # when self.targets contains duplicates.
            for index, target in enumerate(self.targets):
                if target in temp[0]:
                    self.target_num_l[index] += 1
                    self.target_freq_l[index] += query_freq

        num_sorted = len(query_list_sorted)
        self.total_num += num
        self.total_num_sorted += num_sorted
        self.total_freq += freq
        self.total_freq_sorted += freq_sorted

        # Guard both ratios: an empty file (or one with zero total
        # frequency) would otherwise raise ZeroDivisionError.
        num_ratio = float(num_sorted) / num if num else 0.0
        freq_ratio = float(freq_sorted) / freq if freq else 0.0

        result = ("Query Log: %s\n" % filename) \
               + ("Number of queries: %d\n" % num) \
               + ("Queries over %d bytes: %d\n" % (self.min_char, num_sorted)) \
               + ("Queries frequency: %d\n" % freq) \
               + ("Long queries frequency: %d\n" % freq_sorted) \
               + ("Long query ratio: %0.3f\n" % num_ratio) \
               + ("Long query frequency ratio: %0.3f\n" % freq_ratio)

        content_sorted = "".join(query + "\n" for query in query_list_sorted)
        rw.writeFile(tar_path, content_sorted)
        return result
예제 #12
0
def sort_file(src_dir, res_dir, filename, cols, min_f):
    """Keep the selected columns of every record whose frequency is high enough.

    Each line of ``src_dir/filename`` is split on tabs. The column indexed by
    ``cols[-1]`` is parsed as an integer frequency; records with a frequency
    of at least ``min_f`` are reduced to the columns listed in ``cols``
    (tab-joined, in that order) and written to ``res_dir/filename``.

    Returns the text that was written.
    """
    print("Pre-processing file: %s" % filename)
    src_path = src_dir + '/' + filename
    res_path = res_dir + '/' + filename

    kept = []
    for record in rw.readFile(src_path).split('\n')[0:-1]:
        fields = record.split("\t")
        freq = int(fields[cols[-1]])
        if freq >= min_f:
            kept.append('\t'.join(fields[col] for col in cols) + '\n')

    content_sorted = "".join(kept)
    rw.writeFile(res_path, content_sorted)
    return content_sorted
예제 #13
0
	def run(self):
		#define the server thread
		self.s = socket.socket()
		self.host = socket.gethostname()
		self.port = 2222 + self.pid * 10
		self.s.bind((self.host, self.port))
		self.s.listen(5)

		while True:
			#listen for connection
			c, addr = self.s.accept()
			print '\nGot connection from ', addr

			#initialize file index
			original_index = self.fileindex('original')
			download_index = self.fileindex('download')
			index = original_index + download_index

			#receive message
			msg = c.recv(1024)
			#parse & analyze message
			mid = msg.split(',')[0].split(':')[1]
			action = msg.split(',')[1].split(':')[0]
			name = msg.split(',')[1].split(':')[1]

			if action == 'search':
				print 'search ' + name
				ttl = msg.split(',')[2].split('=')[1]
				#read mid_list from file 'msg_list'
				mid_list = rw.readList('msg_list.txt')
				#decide broadcast or not
				if int(ttl) == 1 or mid in mid_list:
					print 'file dont need to pass'
				else:
					#update mid_list in file 'msg_list'
					mid_list.append(mid)
					rw.write('msg_list.txt', mid_list)
					#decrease the ttl value by 1 after broadcast for once
					rmsg = msg.rsplit(ttl, 1)
					msg = str(int(ttl)-1).join(rmsg)
					#start autobroadcast thread (details in autobroadcast.py module)
					broadcast = autobroadcast.Auto(msg, self.neighbor)
					broadcast.start()
					broadcast.join()
					if name in index:
						#check file's state
						meta_dict = rw.readDict('metadata.txt')
						if (name in original_index and (meta_dict['original'][name]['state'] == 'valid' or meta_dict['download'][name]['state'] == 'valid')):
							#start hitresponse thread (details in hitresponse.py module)
							msg = 'mid:' + str(self.pid) + '|' + str(self.port) + ',response:' + name
							hit = hitresponse.Hit(msg, mid)
							hit.start()
							hit.join()
						else:
							'file state is not qualified'
					else:
						print 'no file match, pass to neighbors'
			elif action == 'update':
				print 'update ' + name
				#read mid_list from file 'msg_list'
				mid_list = rw.readList('msg_list.txt')
				#decide broadcast or not
				if mid in mid_list:
					print 'update dont need to pass'
				else:
					#update mid_list in file 'msg_list'
					mid_list.append(mid)
					rw.write('msg_list.txt', mid_list)
					#start autobroadcast thread (details in autobroadcast.py module)
					broadcast = autobroadcast.Auto(msg, self.neighbor)
					broadcast.start()
					broadcast.join()
					if name in download_index:
						#set file state to invalid
						meta_dict = rw.readDict('metadata.txt')
						meta_dict['download'][name]['state'] = 'invalid'
						rw.write('metadata.txt', meta_dict)
					else:
						print 'no file need to update, pass to neighbors'
			elif action == 'check':
				print 'check ' + name
				#check the original file's version
				version = int(msg.split(',')[2].split(':')[1])
				TTR = int(msg.split(',')[3].split(':')[1])
				meta_dict = rw.readDict('metadata.txt')
				if meta_dict['original'][name]['version'] == version:
					#if same version, send a new TTR.
					newTTR = 2 * TTR
					msg = 'mid:' + str(self.pid) + '|' + str(self.port) + ',checkresponse:' + name + ',state:valid,TTR:' + str(newTTR)
				else:
					#send Invalid & new version exist.
					msg = 'mid:' + str(self.pid) + '|' + str(self.port) + ',checkresponse:' + name + ',state:invalid'
				hit = hitresponse.Hit(msg, mid)
				hit.start()
				hit.join()
			elif action == 'checkresponse':
				print 'checkresponse ' + name
				#update the metadata
				state = str(msg.split(',')[2].split(':')[1])
				if state == 'valid':
					TTR = int(msg.split(',')[3].split(':')[1])
					#change TTR to 2TTR. (details in pullrenew.py module)
					pullrenew.renew('TTR', name, TTR)
					#change state from 'TTR Expired' back to 'valid' (details in pullrenew.py module)
					pullrenew.renew('state', name, state)
				elif state == 'invalid':
					#change state from 'TTR Expired' back to 'invalid' (details in pullrenew.py module)
					pullrenew.renew('state', name, state)
			elif action == 'response':
				#start reconnect thread (details in reconnect.py module)
				print 'response ' + name + ' from ' + mid
				#read name_list from file 'req_list'
				name_list = rw.readList('req_list.txt')
				#decide reconnect or not
				if name in name_list:
					msg = 'mid:' + str(self.pid) + '|' + str(self.port) + ',obtain:' + name
					connect = reconnect.Connect(msg, mid, name)
					connect.start()
					connect.join()
				else:
					print 'file has been obtained'
			elif action == 'obtain':
				print 'obtain ' + name
				#start transfer directly to original peer
				if name in original_index:
					path = os.path.join(os.getcwd(), 'files', 'original', name)
					metadata = rw.readDict('metadata.txt')['original'][name]
				elif name in download_index:
					path = os.path.join(os.getcwd(), 'files', 'download', name)
					metadata = rw.readDict('metadata.txt')['download'][name]
				#send metadata & file
				c.sendall(json.dumps(metadata))
				#slice file into chunks by buffer
				content = rw.readFile(path)
				i = 0
				while i <= len(content):
					chunk = buffer(content, i, 1024)
					c.sendall(chunk)
					i += 1024
				print 'send file: ' + name

			c.close()
예제 #14
0
 def init_dict(self):
     """Add every line of the file at ``self.dict_dir`` to ``self.dictionary``.

     The element after the final newline is discarded.
     """
     raw = rw.readFile(self.dict_dir)
     for entry in raw.split('\n')[:-1]:
         self.dictionary.add(entry)
예제 #15
0
 def init_dic(self):
     """Fill ``self.dictionary`` from the tab-separated file at ``self.dic_dir``.

     The first column of each line becomes the key and the second column the
     (string) value. Returns ``self.dictionary``.
     """
     for record in rw.readFile(self.dic_dir).split('\n')[:-1]:
         fields = record.split('\t')
         self.dictionary[fields[0]] = fields[1]
     return self.dictionary
예제 #16
0
 def init_dic(self):
     """Load ``key<TAB>value`` lines from ``self.dic_dir`` into ``self.dictionary``.

     Values are stored as read (strings). Returns ``self.dictionary``.
     """
     text = rw.readFile(self.dic_dir)
     for entry in text.split('\n')[:-1]:
         columns = entry.split('\t')
         key, value = columns[0], columns[1]
         self.dictionary[key] = value
     return self.dictionary
예제 #17
0
 def init_dict(self):
     """Populate ``self.dictionary`` with one entry per line of ``self.dict_dir``."""
     lines = rw.readFile(self.dict_dir).split('\n')
     # the piece after the last newline is not an entry
     for item in lines[:-1]:
         self.dictionary.add(item)
예제 #18
0
 def init_model(self):
     """Append the first tab-separated column of each line to ``self.models``.

     Lines are read from the file at ``self.src_dir``; the element after the
     final newline is discarded.
     """
     for record in rw.readFile(self.src_dir).split('\n')[:-1]:
         self.models.append(record.split('\t')[0])
예제 #19
0
 def init_model(self):
     """Read the model names (first column per line) from ``self.src_dir``.

     Each name is appended to ``self.models`` in file order.
     """
     text = rw.readFile(self.src_dir)
     for entry in text.split('\n')[:-1]:
         # partition keeps everything before the first tab, like split('\t')[0]
         model = entry.partition('\t')[0]
         self.models.append(model)