def loadloc(self):
    filereader = FileReader("../NE_INFO.csv")
    neididx = filereader.getattridx("NE_ID")
    noidx = filereader.getattridx("NE_NO")
    nameidx = filereader.getattridx("NE_NAME")
    ididx = filereader.getattridx("ID_IN_NM")
    self.m_locdata = []
    while True:
        tmptran = filereader.readtransection()
        if tmptran is None:
            break
        innminfo = tmptran[ididx]
        siteididx = innminfo.find("BtsSiteMgr=BCF-")
        if siteididx == -1:
            siteididx = innminfo.find("BtsSiteMgr=")
            if siteididx == -1:
                innm = "-1"
            else:
                innm = innminfo[siteididx + len("BtsSiteMgr="):]
        else:
            # innm = innminfo[siteididx+len("BtsSiteMgr=BCF-"):]
            innm = "-1"
        self.m_locdata.append(
            NE(tmptran[noidx], tmptran[nameidx], tmptran[ididx],
               tmptran[neididx]))
def printinfo():
    fnamelist = ["../10" + str(v) + ".csv" for v in xrange(22, 23)]
    cnt = 0
    found = 0
    wholeresult = {}
    # writefile = open("../cleandata","w")
    missloc = {}
    for fname in fnamelist:
        filereader = FileReader(fname)
        alarmcode = filereader.getattridx("ALARMCODE")
        attridx = filereader.getattridx("SUMMARY")
        locidx = filereader.getattridx("LOCATION")
        timeidx = filereader.getattridx("ALARMHAPPENTIME")
        cntidx = 0
        while True:
            tmptran = filereader.readtransection()
            cntidx += 1
            # print cntidx
            if tmptran is None:
                filereader.close()
                break
            summary = tmptran[attridx]
            location = tmptran[locidx]
            if location.startswith("SU6095-SU2551"):
                print "SUMMARY:"
                print summary
                print "--------------"
                print "LOCATION"
                print location
def setUp(self):
    # tempfile.mktemp only returns a name and is race-prone (it is
    # deprecated); kept as in the original, see the safer variant below
    self.tmpfn = tempfile.mktemp("filereadertest")
    fp = open(self.tmpfn, 'w')
    for line in self.lines:
        fp.write(line)
    fp.close()
    self.f = FileReader(self.tmpfn)
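# A safer setUp sketch using tempfile.mkstemp, which creates the file
# atomically; it assumes the same self.lines fixture and FileReader import
# as above, plus `import os`.
def setUp(self):
    fd, self.tmpfn = tempfile.mkstemp(suffix="filereadertest")
    with os.fdopen(fd, 'w') as fp:
        for line in self.lines:
            fp.write(line)
    self.f = FileReader(self.tmpfn)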
def tongjidistype(fname, attrname):
    """Tally how often each distinct value of `attrname` occurs in `fname`."""
    filereader = FileReader(fname)
    attridx = filereader.getattridx(attrname)
    trandata = {}
    while True:
        tmptran = filereader.readtransection()
        if tmptran is None:
            break
        summary = tmptran[attridx]
        if summary not in trandata:
            trandata[summary] = 0
        trandata[summary] += 1
    valuelist = trandata.values()
    valuelist.sort()
    c = Counter(valuelist)
    keylist = c.keys()
    keylist.sort()
    for key in keylist:
        print key, "\t:\t", c[key]
    itemslist = trandata.items()
    itemslist.sort(key=lambda v: v[1], reverse=True)
    # for key,value in itemslist:
    #     print key
    #     print value
    #     raw_input()
    # print valuelist
    # raw_input()
    print "========================================================"
    import pprint
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(trandata)
    print "length:", len(trandata)
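# Usage sketch: the alarm dumps used elsewhere in this file expose columns
# such as "ALARMCODE" and "SUMMARY", so a plausible call (file name assumed)
# is:
#   tongjidistype("../1022.csv", "ALARMCODE")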
def execution():
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbosity", type=str,
                        help="increase output verbosity")
    parser.add_argument("-i", "--input", type=str, default="data_kidney",
                        help="input")
    parser.add_argument("-o", "--output", type=str, default='data_kidney.jpg',
                        help="output")
    args = parser.parse_args()
    file_reader = FileReader(args.input)
    scanconversion = ScanConverter(file_reader)
    scanconversion.convert(file_reader, args.output)
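# Invocation sketch (the script name is hypothetical); both flags fall back
# to the defaults declared above when omitted:
#   python scan_convert.py -i data_kidney -o data_kidney.jpg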
def tongjilocation():
    filereader = FileReader("../1022.csv")
    attridx = filereader.getattridx("LOCATION")
    trandata = {}
    while True:
        tmptran = filereader.readtransection()
        if tmptran is None:
            break
        loc = tmptran[attridx]
        try:
            locinfo = loc.split(";")[1].split("/")
            targetloc = locinfo[0]
            if targetloc not in trandata:
                trandata[targetloc] = 0
            trandata[targetloc] += 1
        except IndexError:
            # locations without the expected ";"-separated segment;
            # the original used a bare except here
            print loc
            # raw_input()
    valuelist = trandata.values()
    valuelist.sort()
    c = Counter(valuelist)
    keylist = c.keys()
    keylist.sort()
    for key in keylist:
        print key, "\t:\t", c[key]
    raw_input()
    raw_input()
    itemslist = trandata.items()
    itemslist.sort(key=lambda v: v[1], reverse=True)
    for key, value in itemslist:
        print key
        print value
        raw_input()
def __init__(self):
    # this is determined by whether we read from a file or not
    self.data_set = None
    self.data_targets = None
    self.file_reader = FileReader()
    # these hold our training and testing values after we split the data
    self.training_data = None
    self.training_targets = None
    self.test_data = None
    self.test_targets = None
    # these values hold the label encoded arrays for working with sklearn's
    # implementation
    self.sklearn_training_data = None
    self.sklearn_training_targets = None
    self.sklearn_testing_data = None
    self.sklearn_testing_targets = None
    self.most_common = None
    self.classifier = None
    self.model = None
    self.predicted_targets = None
    pandas.options.mode.chained_assignment = None
def list_file_recursive(self, path):
    """Print list of given file's contents recursively."""
    with FileReader(path, 'rb') as file:
        decoder = codec.getDecoderForFile(file)
        decoder = decoder(file, None)
        # the original also initialized an unused `items = []`; dropped here
        for obj in decoder.objects:
            self._list_recursive(obj)
def read_file_test(self):
    file_reader = FileReader("data/partitions.txt")
    cluster_points = file_reader.read_file_to_cluster_points()
    self.assertIsInstance(cluster_points, list)
    self.assertGreater(len(cluster_points), 1)
    self.assertEqual(cluster_points[0].point_id, 1)
    self.assertEqual(cluster_points[0].cluster_id, 2)
    for cluster_point in cluster_points:
        self.assertIsInstance(cluster_point, ClusterPoint)
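# A plausible shape for data/partitions.txt, inferred from the assertions
# above (assumption: one whitespace-separated "point_id cluster_id" pair per
# line, matching the FileReader("", " ") separator used later in this file):
#   1 2
#   2 1
#   3 2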
def makeFileReader(self, file, mode='rb') -> FileReader:
    """Make a FileReader for the given file, with default settings
    for this app.
    """
    log.debug("makeFileReader(%s: %s)", file, getattr(file, 'name', None))
    if isinstance(file, FileReader):
        return file
    return FileReader(file, mode, endian=self.endian,
                      defaultStringLengthFmt='H')
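# Usage sketch (the `app` object and path are hypothetical): because of the
# isinstance check above, passing an existing FileReader back through returns
# it unchanged, so call sites can accept either a path or a wrapped reader.
#   reader = app.makeFileReader("rom.bin")
#   assert app.makeFileReader(reader) is reader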
def printidentifier():
    filereader = FileReader("../1022.csv")
    identifieridx = filereader.getattridx("NEIDENTIFIER")
    while True:
        tmptran = filereader.readtransection()
        if tmptran is None:
            filereader.close()
            break
        # index only after the None check; the original indexed tmptran
        # before checking it, which raises TypeError at end of file
        identifier = tmptran[identifieridx]
        print identifier
        raw_input()
def process_dump(input_file, out_file, workers_count):
    """
    :param input_file: name of the wikipedia dump file; '-' to read from stdin
    :param out_file: directory where to store extracted data, or '-' for stdout
    :param workers_count: number of extraction processes to spawn.
    """
    logging.info("Starting map reduce processes...")
    workers_count = max(1, workers_count)
    maxsize = 10 * workers_count
    # output queue
    output_queue = Queue(maxsize=maxsize)
    # input queue
    jobs_queue = Queue(maxsize=maxsize)
    file_reader = FileReader(input_file)
    database_writer = DatabaseWriter(config, buffer_size=1000)
    # database_writer.check_connection()
    workers = []
    for i in range(workers_count):
        worker = json_processor_class(i)
        extractor = Instance(target=worker.execute,
                             args=(jobs_queue, output_queue))
        extractor.daemon = True  # only live while parent process lives
        extractor.start()
        worker.process = extractor
        workers.append(worker)
    output = Instance(target=database_writer.execute, args=(output_queue, ))
    output.start()
    output_queue_size = lambda: output_queue.qsize()
    # map job that sorts and prints output
    map = Instance(target=file_reader.execute,
                   args=(jobs_queue, output_queue_size))
    map.start()
    map.join()
    logging.info("Completing workers...")
    for _ in workers:
        jobs_queue.put(None)
    for w in workers:
        w.process.join()
    logging.info("Completing database writer...")
    output_queue.put(None)
    output.join()
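# Minimal sketch of the queue contract process_dump relies on: each consumer
# loops on its input queue until it sees the None sentinel that the parent
# enqueues once per worker. The body is an assumption for illustration
# (handle_job is hypothetical), not the project's actual worker.
def execute(self, jobs_queue, output_queue):
    while True:
        job = jobs_queue.get()
        if job is None:  # sentinel: shut down cleanly
            break
        output_queue.put(self.handle_job(job))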
def loadtopo(self):
    self.m_directtopo = {}
    self.m_fatherdata = {}
    self.m_topodict = {}
    filereader = FileReader("../NE_TOPO_INFO.csv")
    neidx = filereader.getattridx("NE_ID")
    parentidx = filereader.getattridx("PARENT_NE_ID")
    empty = 0
    nonempty = 0
    halfempty = 0
    while True:
        tmptran = filereader.readtransection()
        if tmptran is None:
            break
        neid = tmptran[neidx]
        parentneid = tmptran[parentidx]
        childne = self.getnebysiteid(neid)
        parentne = self.getnebysiteid(parentneid)
        if childne is None and parentne is None:
            empty += 1
            continue
        elif childne is None or parentne is None:
            halfempty += 1
            if parentne is None:
                childnename = childne.m_name
                parentnename = parentneid
                if childnename not in self.m_fatherdata:
                    self.m_fatherdata[childnename] = []
                self.m_fatherdata[childnename].append(parentnename)
            continue
        else:
            nonempty += 1
            childnename = childne.m_name
            parentnename = parentne.m_name
            if childnename not in self.m_fatherdata:
                self.m_fatherdata[childnename] = []
            self.m_fatherdata[childnename].append(parentnename)
            if childnename not in self.m_topodict:
                self.m_topodict[childnename] = []
            if parentnename not in self.m_topodict:
                self.m_topodict[parentnename] = []
            if parentnename not in self.m_directtopo:
                self.m_directtopo[parentnename] = []
            self.m_topodict[childnename].append(parentnename)
            self.m_topodict[parentnename].append(childnename)
            self.m_directtopo[parentnename].append(childnename)
    print "empty:", empty
    print "halfempty:", halfempty
    print "nonempty:", nonempty
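# For a single resolved edge parent P -> child C, loadtopo leaves the three
# maps as follows (names taken from the code above):
#   m_fatherdata: {"C": ["P"]}              # child name -> parent names
#   m_topodict:   {"C": ["P"], "P": ["C"]}  # undirected adjacency
#   m_directtopo: {"P": ["C"]}              # parent name -> child names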
def tongjinetype():
    filereader = FileReader("../NE_INFO.csv")
    attridx = filereader.getattridx("NE_CAT_ID")
    trandata = {}
    while True:
        tmptran = filereader.readtransection()
        if tmptran is None:
            break
        summary = tmptran[attridx]
        if summary not in trandata:
            trandata[summary] = 0
        trandata[summary] += 1
    import pprint
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(trandata)
def addmesh(self, meshfilelocation, pixelrange):
    meshfile = FileReader(meshfilelocation, "read")
    meshdata = []
    color = [255, 255, 255]
    complete = False
    width = 1
    for x in meshfile.fileOutput:
        if "color" in x:
            color = list(map(int, x.split("=")[1].split(" ")))
        elif "linewidth" in x:
            width = int(x.split("=")[1])
        elif "complete" in x:
            # the original used bool(x.split("=")[1]), but bool() of any
            # non-empty string is True (even "False"); parse explicitly
            complete = x.split("=")[1].strip().lower() in ("1", "true")
        else:
            meshdata.append(list(map(int, x.split(" "))))
    self.drawMeshList.append(
        Mesh(meshdata, pixelrange, color, complete, width))
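# A plausible mesh file for addmesh, inferred from the parsing above
# (assumption: "key=value" lines for settings, every other line a run of
# space-separated integer coordinates):
#   color=255 0 0
#   linewidth=2
#   complete=true
#   10 10 40 40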
def test_file_reader():
    # try:
    #     reader = FileReader('file.txt')
    #     reader.read_next_line()
    #     reader.read_next_line()
    #     reader.read_next_line()
    # # except IOError:
    # #     print('IOError exception')
    # finally:
    #     # check if variable reader exists
    #     # this might be dangerous if other reader
    #     # has been declared before
    #     if ('reader' in locals()):
    #         reader.close()

    # ensure that resources will be freed properly
    with FileReader('file.txt') as reader:
        reader.read_next_line()
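# For the `with` form to release the handle, FileReader must implement the
# context-manager protocol. A minimal sketch of such a class (an assumption
# for illustration, not the project's actual implementation):
class FileReader:
    def __init__(self, path):
        self._fp = open(path)

    def read_next_line(self):
        return self._fp.readline()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._fp.close()  # runs even if the body raised
        return False      # do not suppress exceptions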
def __init__(self):
    self.iris = datasets.load_iris()
    # this is determined by whether we read from a file or not
    self.data_set = None
    self.data_targets = None
    self.file_reader = FileReader()
    # these hold our training and testing values after we split the data
    self.training_data = None
    self.training_targets = None
    self.test_data = None
    self.test_targets = None
    self.classifier = None
    self.model = None
    self.predicted_targets = None
def calneidlen():
    filereader = FileReader("../NE_INFO.csv")
    attridx = filereader.getattridx("NE_NO")
    lendict = {}
    idx = 0
    while True:
        tmptran = filereader.readtransection()
        if tmptran is None:
            break
        neid = tmptran[attridx]
        neidlen = len(neid)
        if neidlen not in lendict:
            lendict[neidlen] = 0
        lendict[neidlen] += 1
        idx += 1
        print neid
        if idx % 100 == 0:
            raw_input()
    import pprint
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(lendict)
def testwrongfile():
    filereader = FileReader("../wrongdocfile")
    alarmcodeidx = filereader.getattridx("ALARMCODE")
    attridx = filereader.getattridx("SUMMARY")
    locidx = filereader.getattridx("LOCATION")
    timeidx = filereader.getattridx("ALARMHAPPENTIME")
    print "idxdata:", timeidx, alarmcodeidx, locidx, attridx
    print filereader.m_header
    print filereader.m_headerlen
    while True:
        tmptran = filereader.readtransection()
        if tmptran is None:
            filereader.close()
            break
        summary = tmptran[attridx]
        location = tmptran[locidx]
        alarmcode = tmptran[alarmcodeidx]
        timestr = tmptran[timeidx]
        print tmptran
        raw_input()
def solve(self):
    results = {}
    reader = FileReader(self.fileName)
    procNum = reader.readline()
    taskNum = reader.readline()
    execTimes = [reader.readline() for _ in range(taskNum)]
    times = []
    genetics = [
        PCMaxGenetic(execTimes, procNum) for _ in range(self.instNum)
    ]
    for genetic in genetics:
        try:
            _, cmax = genetic.solve(self.iterNum, self.normIter,
                                    self.mutIter)
            times.append(cmax)
        except OptimumFoundException as e:  # was `except X, e` (Py2-only)
            times.append(e.cmax)
            break
def process_file():
    file_name = request.json['file_name']
    chunk_size = request.json['chunk_size']
    logger.file_logger.init_logger()
    api_manager = MeliApiManager(config.AppConfig.api_base_url)
    formatter = file_format_classes[config.AppConfig.file_format](
        config.AppConfig.file_line_separator)
    file_reader = FileReader(file_name, formatter,
                             config.AppConfig.file_encoding)
    fp = FileProcessor(file_reader, chunk_size, api_manager)
    st = time.time()
    fp.process()
    print('Total api calls were ' + str(api_manager.api_calls_count))
    total_time = time.time() - st
    print("Time is " + str(total_time))
    logger.file_logger.stop_logger()
    return jsonify({
        "total_time_seconds": total_time,
        "http_requests_performed": api_manager.api_calls_count,
        "more_info": config.AppConfig.info
    })
class ReproductionAndTraining:
    """
    Checks the commit data and reproduces the filtering.
    It also provides a classifier that can be used on commits.
    """
    # Files to be read
    f_all = os.getcwd() + "/data/raw/all_commits.txt"
    f_f_band = os.getcwd() + "/data/raw/filterband.txt"
    f_f_frame = os.getcwd() + "/data/raw/filterframe.txt"
    f_f_memory = os.getcwd() + "/data/raw/filtermem.txt"
    f_f_perf = os.getcwd() + "/data/raw/filterperf.txt"
    f_p_band_cache = os.getcwd() + "/data/processed/Categories/Band/cache.txt"
    f_p_band_redundancy = os.getcwd() + "/data/processed/Categories/Band/reduncancy.txt"
    f_p_band_throttling = os.getcwd() + "/data/processed/Categories/Band/throttling.txt"
    f_p_band_unknown = os.getcwd() + "/data/processed/Categories/Band/unknown.txt"
    f_p_frame_redundant = os.getcwd() + "/data/processed/Categories/Frame/redundant.txt"
    f_p_frame_threading = os.getcwd() + "/data/processed/Categories/Frame/Threading.txt"
    f_p_frame_unknown = os.getcwd() + "/data/processed/Categories/Frame/Unknown.txt"
    f_p_frame_visual = os.getcwd() + "/data/processed/Categories/Frame/Visual.txt"
    f_p_memory_assets = os.getcwd() + "/data/processed/Categories/Memory/Assests.txt"
    f_p_memory_fixleak = os.getcwd() + "/data/processed/Categories/Memory/FixLeak.txt"
    f_p_memory_lowmem = os.getcwd() + "/data/processed/Categories/Memory/LowMem.txt"
    f_p_memory_reducesizedata = os.getcwd() + "/data/processed/Categories/Memory/reduceSizeData.txt"
    f_p_memory_unknown = os.getcwd() + "/data/processed/Categories/Memory/Unknown.txt"
    f_p_perf_algorithm = os.getcwd() + "/data/processed/Categories/Perf/Algorithm.txt"
    f_p_perf_assets = os.getcwd() + "/data/processed/Categories/Perf/assets.txt"
    f_p_perf_caching = os.getcwd() + "/data/processed/Categories/Perf/caching.txt"
    f_p_perf_concurrency = os.getcwd() + "/data/processed/Categories/Perf/Concurrency.txt"
    f_p_perf_datastructure = os.getcwd() + "/data/processed/Categories/Perf/DataStructure.txt"
    f_p_perf_earlyreturn = os.getcwd() + "/data/processed/Categories/Perf/EarlyReturn.txt"
    f_p_perf_orderofoperations = os.getcwd() + "/data/processed/Categories/Perf/OrderOFOperations.txt"
    f_p_perf_parsing = os.getcwd() + "/data/processed/Categories/Perf/Parsing.txt"
    f_p_perf_redundancy = os.getcwd() + "/data/processed/Categories/Perf/redundancy.txt"
    f_p_perf_sqlquery = os.getcwd() + "/data/processed/Categories/Perf/SQLQuery.txt"
    f_p_perf_timeout = os.getcwd() + "/data/processed/Categories/Perf/TimeOut.txt"
    f_p_perf_unknown = os.getcwd() + "/data/processed/Categories/Perf/Unknown.txt"
    # from https://github.com/amazuerar/perf-bugs-mobile/blob/master/bug-fixing-commits-performance.csv
    f_external_perf = os.getcwd() + "/data/external/performance_commits.txt"
    # from http://gustavopinto.org/energy-aware-mining/
    f_external_energy = os.getcwd() + "/data/external/energy_commits.txt"
    # full external sets, not just the ones they identified as relevant
    f_external_dataset_perf = os.getcwd() + "/data/external/performance_full.txt"
    f_external_dataset_energy = os.getcwd() + "/data/external/energy_full.txt"
    f_toBeClassified = os.getcwd() + "/data/commits.txt"
    # f_compare = os.getcwd() + "/relevant.txt"

    # keywords used for each type
    keywords_band = [
        "network", "bandwidth", "size", "download", "upload", "socket"
    ]
    keywords_frame = ["jank", "frame", "respons", "lag"]  # excluded "hang"
    keywords_memory = ["memory", "leak", "size", "cache", "buffer", "space"]
    keywords_perf = ["effic", "speed", "time", "perform", "slow", "fast"]
    keywords = []
    keywords.extend(keywords_band)
    keywords.extend(keywords_frame)
    keywords.extend(keywords_memory)
    keywords.extend(keywords_perf)

    band = []
    frame = []
    memory = []
    performance = []

    # processing units
    fr = FileReader()

    def compare_keywords(self, relevant, all, compare, originalDs=False):
        if not originalDs:
            new_all = list()
            count_keywords = 0
            for commit in all:
                if any(word in commit.text for word in self.keywords):
                    count_keywords += 1
                    new_all.append(commit)
            print("Keyword filter " + str(count_keywords))
            # I have to use the pre-filter here because my PC does not have
            # the RAM.
            all = new_all
        # TODO pre_fn and feature_fn of best
        print("Starting algorithm analysis on keywords" +
              datetime.now().strftime("%H:%M:%S"))
        labels = ["relevant" for i in range(len(relevant))]
        relevant_features = [
            " ".join(self.stem_text(x.text)) for x in relevant
        ]
        irrelevant_features = [
            " ".join(self.stem_text(x.text)) for x in self.irrelevant_commits
        ]
        unknown_features = [" ".join(self.stem_text(x.text)) for x in all]
        features = []
        features.extend(relevant_features)
        features.extend(irrelevant_features)
        features.extend(unknown_features)
        x = self.tf_idf(features)
        y = labels
        y.extend(["irrelevant" for i in range(len(irrelevant_features))])
        print("Featurized: " + datetime.now().strftime("%H:%M:%S"))
        # ORIGINAL VERSION + Version where relevant are balanced
        x_sub = x[:len(relevant_features) + len(irrelevant_features)]
        y_sub = y[:len(relevant_features) + len(irrelevant_features)]
        x_unknown = x[len(relevant_features) + len(irrelevant_features):]
        classifier = DecisionTreeClassifier()
        # train classifier
        classifier.fit(x_sub, y_sub)
        print("Trained: " + datetime.now().strftime("%H:%M:%S"))
        # evaluate model
        x_all_prediction = classifier.predict(x_unknown)
        for i in range(len(x_all_prediction)):
            if x_all_prediction[i] == "relevant":
                print(all[i])
        calc_relevant = 0
        calc_additional = 0
        for i in range(len(all)):
            if x_all_prediction[i] == "relevant":
                if all[i] in compare:
                    calc_relevant += 1
                else:
                    calc_additional += 1
                    print("additional commit " + all[i].cmt_hash)
        print("relevant: " + str(calc_relevant))
        print("additional: " + str(calc_additional))
        print("results for repo:")
        for i in range(len(all)):
            if x_all_prediction[i] == "relevant":
                print(all[i].fullString)
        if originalDs:
            calc_perf = 0
            calc_mem = 0
            calc_band = 0
            calc_fram = 0
            for i in range(len(all)):
                if x_all_prediction[i] == "relevant":
                    if all[i] in self.performance:
                        calc_perf += 1
                    if all[i] in self.memory:
                        calc_mem += 1
                    if all[i] in self.band:
                        calc_band += 1
                    if all[i] in self.frame:
                        calc_fram += 1
            print("execution time " + str(calc_perf))
            print("memory " + str(calc_mem))
            print("bandwidth " + str(calc_band))
            print("framerate " + str(calc_fram))
        return None

    def __init__(self) -> None:
        super().__init__()
        self.important_words = dict()
        self.important_words["new_word"] = {
            "r1": 0.1,
            "r2": 2,
            "word": "new_word"
        }
        # init nltk and sklearn
        nltk.download("punkt")
        nltk.download("stopwords")
        nltk.download("wordnet")
        nltk.download("averaged_perceptron_tagger")
        nltk.download("words")
        nltk.download("maxent_ne_chunker")
        nltk.download("vader_lexicon")
        self.stops = stopwords.words("english")
        self.count_vectorizer = CountVectorizer()
        self.tfdif_vectorizer = TfidfVectorizer()
        self.stemmer = nltk.PorterStemmer()
        self.lemmatizer = nltk.WordNetLemmatizer()
        print("Initializing")
        band_cache = self.fr.parse(self.f_p_band_cache)
        self.band.extend(band_cache)
        band_redundancy = self.fr.parse(self.f_p_band_redundancy)
        self.band.extend(band_redundancy)
        band_throttling = self.fr.parse(self.f_p_band_throttling)
        self.band.extend(band_throttling)
        band_unknown = self.fr.parse(self.f_p_band_unknown)
        self.band.extend(band_unknown)
        # self.check_duplicates("band", self.band)
        self.band = list(set(self.band))

        frame_redundant = self.fr.parse(self.f_p_frame_redundant)
        self.frame.extend(frame_redundant)
        frame_threading = self.fr.parse(self.f_p_frame_threading)
        self.frame.extend(frame_threading)
        frame_unknown = self.fr.parse(self.f_p_frame_unknown)
        self.frame.extend(frame_unknown)
        frame_visual = self.fr.parse(self.f_p_frame_visual)
        self.frame.extend(frame_visual)
        # self.check_duplicates("frame", self.frame)
        self.frame = list(set(self.frame))

        memory_assets = self.fr.parse(self.f_p_memory_assets)
        self.memory.extend(memory_assets)
        memory_fixleak = self.fr.parse(self.f_p_memory_fixleak)
        self.memory.extend(memory_fixleak)
        memory_lowmem = self.fr.parse(self.f_p_memory_lowmem)
        self.memory.extend(memory_lowmem)
        memory_unknown = self.fr.parse(self.f_p_memory_unknown)
        self.memory.extend(memory_unknown)
        memory_reducesizedata = self.fr.parse(self.f_p_memory_reducesizedata)
        self.memory.extend(memory_reducesizedata)
        # self.check_duplicates("memory", self.memory)
        self.memory = list(set(self.memory))

        performance_algorithm = self.fr.parse(self.f_p_perf_algorithm)
        self.performance.extend(performance_algorithm)
        performance_assets = self.fr.parse(self.f_p_perf_assets)
        self.performance.extend(performance_assets)
        performance_caching = self.fr.parse(self.f_p_perf_caching)
        self.performance.extend(performance_caching)
        performance_concurrency = self.fr.parse(self.f_p_perf_concurrency)
        self.performance.extend(performance_concurrency)
        performance_datastructure = self.fr.parse(self.f_p_perf_datastructure)
        self.performance.extend(performance_datastructure)
        performance_earlyreturn = self.fr.parse(self.f_p_perf_earlyreturn)
        self.performance.extend(performance_earlyreturn)
        performance_orderofoperations = self.fr.parse(
            self.f_p_perf_orderofoperations)
        self.performance.extend(performance_orderofoperations)
        performance_parsing = self.fr.parse(self.f_p_perf_parsing)
        self.performance.extend(performance_parsing)
        performance_redundancy = self.fr.parse(self.f_p_perf_redundancy)
        self.performance.extend(performance_redundancy)
        performance_sqlquery = self.fr.parse(self.f_p_perf_sqlquery)
        self.performance.extend(performance_sqlquery)
        performance_timeout = self.fr.parse(self.f_p_perf_timeout)
        self.performance.extend(performance_timeout)
        performance_unknown = self.fr.parse(self.f_p_perf_unknown)
        self.performance.extend(performance_unknown)
        # self.check_duplicates("performance", self.performance)
        self.performance = list(set(self.performance))

        self.ext_performance = self.fr.parse(self.f_external_perf)
        self.ext_energy = self.fr.parse(self.f_external_energy)
        # self.ext_performance_all = self.fr.parse(self.f_external_dataset_perf)
        self.ext_toBeClassified = self.fr.parse(self.f_toBeClassified)
        self.ext_compare = []  # self.fr.parse(self.f_compare)

        print("Preparing Commit Sets")
        relevant_commits = []
        relevant_commits.extend(self.band)
        relevant_commits.extend(self.frame)
        relevant_commits.extend(self.memory)
        relevant_commits.extend(self.performance)
        # reduce duplicates
        self.relevant_commits = list(set(relevant_commits))
        # add external set
        self.relevant_commits_plus = list(self.relevant_commits)
        self.relevant_commits_plus.extend(self.ext_performance)

        self.f_band = self.fr.parse(self.f_f_band)
        self.f_band = list(set(self.f_band))
        self.f_frame = self.fr.parse(self.f_f_frame)
        self.f_frame = list(set(self.f_frame))
        self.f_memory = self.fr.parse(self.f_f_memory)
        self.f_perf = self.fr.parse(self.f_f_perf)
        self.filtered_commits = []
        self.filtered_commits.extend(self.f_band)
        self.filtered_commits.extend(self.f_frame)
        self.filtered_commits.extend(self.f_memory)
        self.filtered_commits.extend(self.f_perf)
        self.important_words = dict()
        irrelevant_commits = [
            i for i in self.filtered_commits
            if i not in self.relevant_commits
        ]
        self.irrelevant_commits = list(set(irrelevant_commits))
        self.all_commits = self.fr.parse(self.f_all)
        unknown_commits = [
            i for i in self.all_commits if i not in self.relevant_commits
            and i not in self.irrelevant_commits
        ]
        self.unknown_commits = list(set(unknown_commits))
        self.vocab = set()
        print("Initialization finished")

    def featurize(self, pre_fn, feature_fn, labels, relevant_group):
        # Not repairing the balanced ones as they are pretty terrible
        relevant_features = [" ".join(pre_fn(x.text)) for x in relevant_group]
        irrelevant_features = [
            " ".join(pre_fn(x.text)) for x in self.irrelevant_commits
        ]
        unknown_features = [
            " ".join(pre_fn(x.text)) for x in self.unknown_commits
        ]
        features = []
        features.extend(relevant_features)
        features.extend(irrelevant_features)
        features.extend(unknown_features)
        # VERSION RELEVANT BALANCED -> ALSO add excluded to test set
        # excluded_group = [x for x in self.relevant_commits if x not in relevant_group]
        # features.extend([" ".join(pre_fn(x.text)) for x in excluded_group])
        x = feature_fn(features)
        y = labels
        y.extend(["irrelevant" for i in range(len(irrelevant_features))])
        print("Featurized: " + datetime.now().strftime("%H:%M:%S"))
        # ORIGINAL VERSION + Version where relevant are balanced
        x_sub = x[:len(relevant_features) + len(irrelevant_features)]
        y_sub = y[:len(relevant_features) + len(irrelevant_features)]
        x_unknown = x[len(relevant_features) + len(irrelevant_features):]
        # create sets
        # x_train, x_test, y_train, y_test = train_test_split(x_sub, y_sub, test_size=0.2, random_state=0)
        # # VERSION RELEVANT BALANCED -> ALSO add excluded to test set
        # x_test = np.concatenate((x_test, x[len(x)-len(excluded_group):]))
        # y_test = np.concatenate((y_test, ["relevant" for i in range(len(excluded_group))]))
        # VERSION with equal irrelevant / relevant
        # x_sub = x[:len(relevant_features) + len(relevant_features)]
        # y_sub = y[:len(relevant_features) + len(relevant_features)]
        # x_unknown = x[len(relevant_features) + len(irrelevant_features):]
        # # create sets
        # x_train, x_test, y_train, y_test = train_test_split(x_sub, y_sub, test_size=0.2, random_state=0)
        # # also add the excluded back
        # x_test = np.concatenate((x_test, x[len(relevant_features) + len(relevant_features):len(relevant_features) + len(irrelevant_features)]))
        # y_test = np.concatenate((y_test, y[len(relevant_features) + len(relevant_features):len(relevant_features) + len(irrelevant_features)]))
        # # VERSION RELEVANT BALANCED -> ALSO add excluded to test set
        # x_test = np.concatenate((x_test, x[len(x) - len(excluded_group):]))
        # y_test = np.concatenate((y_test, ["relevant" for i in range(len(excluded_group))]))
        return [x_sub, y_sub]

    def classifier(self, data, out_txt):
        """
        Attempts to create a classifier based on the manually filtered texts.
        For now let's just attempt to predict "relevant" or "not-relevant".
        """
        x_sub = data[0]
        y_sub = data[1]
        print("RUNNING CONFIG " + out_txt)
        # You can test different algorithms by switching the text_clf and
        # parameters around
        # WARNING! Grid-Search is very expensive.
        classifiers = [
            MLPClassifier(),
            KNeighborsClassifier(),
            SVC(),
            # NuSVC(),
            LinearSVC(),
            GaussianProcessClassifier(),
            # RBF(),
            DecisionTreeClassifier(),
            ExtraTreeClassifier(),
            RandomForestClassifier(),
            AdaBoostClassifier(),
            # ExtraTreeClassifier(),
            BaggingClassifier(),
            GradientBoostingClassifier(),
            # VotingClassifier(('lr', LogisticRegression()), ('rf', RandomForestClassifier()), ('gnb', GaussianNB()), voting="soft"),
            GaussianNB(),
            MultinomialNB(),
            # CategoricalNB(),
            BernoulliNB(),
            ComplementNB(),
            QuadraticDiscriminantAnalysis(),
            LinearDiscriminantAnalysis(),
            SGDClassifier(),
            RidgeClassifier(),
            PassiveAggressiveClassifier()
        ]
        class_dict = dict()
        for classifier in classifiers:
            alg_name = type(classifier).__name__
            class_dict[alg_name] = dict()
        count = 10
        for i in range(count):
            # NOTE: with a fixed random_state the ten splits are identical;
            # random_state=i would give ten distinct folds
            x_train, x_test, y_train, y_test = train_test_split(
                x_sub, y_sub, test_size=0.2, random_state=0)
            for classifier in classifiers:
                alg_name = type(classifier).__name__
                try:
                    # featurize
                    print("Start: " + alg_name + " " +
                          datetime.now().strftime("%H:%M:%S"))
                    # train classifier
                    classifier.fit(x_train, y_train)
                    print("Trained: " + datetime.now().strftime("%H:%M:%S"))
                    # evaluate model
                    y_pred = classifier.predict(x_test)
                    print("Predicted: " + datetime.now().strftime("%H:%M:%S"))
                    class_dict[alg_name]["confusion " + str(i)] = \
                        confusion_matrix(y_test, y_pred)
                    class_dict[alg_name]["report " + str(i)] = \
                        classification_report(y_test, y_pred)
                    print(confusion_matrix(y_test, y_pred))
                    print(classification_report(y_test, y_pred))
                    print(accuracy_score(y_test, y_pred))
                except Exception:
                    print(type(classifier).__name__ + " has failed")
        f = open(os.getcwd() + "/data/results/" + out_txt, "a")
        for key, val in class_dict.items():
            f.write("Algorithm " + key + "\n")
            f.write("Confusion Matrices" + "\n")
            f.write(
                "TrueIrrelevant;FalseIrrelevant;FalseRelevant;TrueRelevant" +
                "\n")
            confusionValues = list()
            for i in range(count):
                confString = str(val["confusion " + str(i)][0][0]) + ";" + str(
                    val["confusion " + str(i)][0][1]) + ";" + str(
                        val["confusion " + str(i)][1][0]) + ";" + str(
                            val["confusion " + str(i)][1][1])
                f.write(confString + "\n")
                confusionValues.append(confString)
            f.write(average(confusionValues) + "\n")
            f.write("\nReports\n")
            f.write(
                "I_Precision;I_Recall;I_F1;I_Support;R_Precision;R_Recall;R_F1;R_Support\n"
            )
            repValues = list()
            for i in range(count):
                repString = transformReport(val["report " + str(i)] + "\n")
                f.write(repString + "\n")
                repValues.append(repString)
            f.write(average(repValues) + "\n")
            f.write("\n\n\n")
        f.close()
        # print("Grid search result:")
        # print(classifier.best_params_)
        # allscores = classifier.cv_results_['mean_test_score']
        # print(allscores)

        # Train Classifier for actual use
        # classifier.fit(x_sub, y_sub)
        # print("Trained: " + datetime.now().strftime("%H:%M:%S"))
        #
        # self.predict(classifier, x_unknown, self.unknown_commits, out_txt)
        return None

    def predict(self, classifier, features_x, features: List[Commit], file):
        """
        Predicts a list of commits according to the given features and
        classifier
        :param classifier: to be used for prediction
        :param features_x: feature vector
        :param features: commits according to feature vector
        :param file: to print results to
        :return: nothing
        """
        f = open(file, "a")
        prediction = classifier.predict(features_x)
        i = 0
        print("Predicting " +
              str(len([x for x in prediction if x != "irrelevant"])) +
              " to be relevant " + file)
        while i < len(prediction):
            if prediction[i] != "irrelevant":
                commit = features[i]
                f.write("commit " + commit.cmt_hash + " " + prediction[i] +
                        "\n")
                f.write("Author: " + commit.author + "\n")
                f.write("Date: " + commit.date + "\n")
                f.write("\n" + commit.text + "\n\n")
            i += 1

    def check(self) -> List[Commit]:  # pylint: disable=R0201
        """
        Checks if all commits are mapping correctly from
        raw -> filtered -> manually evaluated
        """
        print("")
        print("----- Checking Commit subset validity -----")
        print("Loaded all " + str(len(self.all_commits)) + " Commits")
        # check Bandwidth
        print("Loaded " + str(len(self.f_band)) +
              " Filtered Bandwidth Commits")
        self.contains(self.all_commits, self.f_band)
        print("Loaded " + str(len(self.band)) + " Bandwidth Commits")
        self.contains(self.f_band, self.band)
        # check framerate
        print("Loaded " + str(len(self.f_frame)) +
              " Filtered Framerate Commits")
        self.contains(self.all_commits, self.f_frame)
        print("Loaded " + str(len(self.frame)) + " Framerate Commits")
        self.contains(self.f_frame, self.frame)
        # check memory
        print("Loaded " + str(len(self.f_memory)) +
              " Filtered Memory Commits")
        self.contains(self.all_commits, self.f_memory)
        print("Loaded " + str(len(self.memory)) + " Memory Commits")
        self.contains(self.f_memory, self.memory)
        # check performance
        print("Loaded " + str(len(self.f_perf)) +
              " Filtered Performance Commits")
        self.contains(self.all_commits, self.f_perf)
        print("Loaded " + str(len(self.performance)) + " Performance Commits")
        self.contains(self.f_perf, self.performance)

        print("----- Checking Overlaps validity -----")
        o_p_m = [i for i in self.performance if i in self.memory]
        o_p_b = [i for i in self.performance if i in self.band]
        o_p_j = [i for i in self.performance if i in self.frame]
        o_m_p = [i for i in self.memory if i in self.performance]
        o_m_b = [i for i in self.memory if i in self.band]
        o_m_j = [i for i in self.memory if i in self.frame]
        o_b_p = [i for i in self.band if i in self.performance]
        o_b_m = [i for i in self.band if i in self.memory]
        o_b_j = [i for i in self.band if i in self.frame]
        o_j_p = [i for i in self.frame if i in self.performance]
        o_j_m = [i for i in self.frame if i in self.memory]
        o_j_b = [i for i in self.frame if i in self.band]
        print("            Performance Memory Bandwidth Jankiness")
        print("Performance " + str(len(self.performance)).ljust(12) +
              str(len(o_p_m)).ljust(7) + str(len(o_p_b)).ljust(9) +
              str(len(o_p_j)).ljust(8))
        print("Memory      " + str(len(o_m_p)).ljust(12) +
              str(len(self.memory)).ljust(7) + str(len(o_m_b)).ljust(9) +
              str(len(o_m_j)).ljust(8))
        print("Bandwidth   " + str(len(o_b_p)).ljust(12) +
              str(len(o_b_m)).ljust(7) + str(len(self.band)).ljust(9) +
              str(len(o_b_j)).ljust(8))
        print("Jankiness   " + str(len(o_j_p)).ljust(12) +
              str(len(o_j_m)).ljust(7) + str(len(o_j_b)).ljust(9) +
              str(len(self.frame)).ljust(8))

        print("----- Checking Keyword validity -----")
        print("Relevant to Irrelevant to Unknown: " +
              str(len(self.relevant_commits)) + " / " +
              str(len(self.irrelevant_commits)) + " / " +
              str(len(self.unknown_commits)))
        # check that all commits correspond to a keyword
        print("The following commits do not correspond to any keyword:")
        for commit in self.relevant_commits:
            if not any(word in commit.text for word in self.keywords):
                print(commit.cmt_hash)
        print("")
        # check keyword efficiency
        self.keywords.append("optimi")
        self.keywords.append("storage")
        keydict = dict()
        for word in self.keywords:
            k = Keyword(word)
            keydict[word] = k
        sum = 0
        sizedict = dict()
        for size in range(0, len(self.keywords)):
            sizedict[size] = 0
        for commit in self.relevant_commits:
            match = ""
            cnt = 0
            for keyword in self.keywords:
                if keyword in commit.text:
                    keydict[keyword].positive_true += 1
                    match += keyword + " "
                    cnt += 1
            # print(str(cnt) + " " + match)
            sizedict[cnt] += 1
            sum += cnt
        print("Average matched keywords: " +
              str(sum / len(self.relevant_commits)))
        print("Matches per keyword count: ")
        for k, v in sizedict.items():
            if (v > 0):
                print("   " + str(k) + " - " + str(v))
        for commit in self.irrelevant_commits:
            for keyword in self.keywords:
                if keyword in commit.text:
                    keydict[keyword].positive_false += 1
        for k, v in keydict.items():
            ratio = 100
            if (v.positive_false > 0):
                ratio = v.positive_true / v.positive_false
            print(
                v.keyword.ljust(10) + " p: " + str(v.positive_true).ljust(3) +
                " n: " + str(v.positive_false).ljust(3) + " u: " +
                str(v.unknown).ljust(4) + " r: " + str(ratio)[:3])

        # count word occurrences to see where we get
        print("----- Tokens in commit ratios -----")
        word_dict = dict()
        for commit in self.relevant_commits:
            # count each token at most once per commit
            for token in set(self.tokenize(commit.text)):
                if token in word_dict:
                    word_dict[token] += 1
                else:
                    word_dict[token] = 1
        irrelevant_commits_tokenized = [
            self.tokenize(commit.text) for commit in self.irrelevant_commits
        ]
        unknown_commits_tokenized = [
            self.tokenize(commit.text) for commit in self.unknown_commits
        ]
        for k, v in sorted(word_dict.items(), key=lambda item: item[1],
                           reverse=True):
            negative = 0
            unknown = 0
            ratio = 1000000
            ratio2 = 1000000
            for commit in irrelevant_commits_tokenized:
                if k in commit:
                    negative += 1
            for commit in unknown_commits_tokenized:
                if k in commit:
                    unknown += 1
            if (negative > 0):
                ratio = v / negative
            if (unknown > 0):
                ratio2 = v / unknown
            self.important_words[str(k)] = {
                "r1": ratio,
                "r2": ratio2,
                "word": k
            }
            if ((ratio > 0.5 and unknown > 0) or
                    (ratio2 > 0.5 and unknown > 0)) and k not in self.keywords:
                print(
                    str(k).ljust(24) + " p: " + str(v).ljust(2) + " n: " +
                    str(negative).ljust(3) + " u: " + str(unknown).ljust(3) +
                    " rn: " + str(ratio)[:3] + " ru: " + str(ratio2)[:3])

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenizes a text string
        :param text: string of words to be tokenized
        :return: list of tokens.
        """
        tokens = [
            word for word in word_tokenize(text.lower()) if word.isalpha()
        ]
        tokens = list(re.findall(r"[A-Za-z]+", " ".join(tokens)))
        tokens = [word for word in tokens if word not in self.stops]
        return tokens

    def lemmatize_text(self, text: str) -> List[str]:
        """
        Conducts lemmatization of given text
        :param text: Text to be tokenized and lemmatized
        :return: array of lemmatized tokens.
        """
        out = [
            self.lemmatizer.lemmatize(token) for token in self.tokenize(text)
        ]
        self.vocab = self.vocab.union(out)
        return out

    def lemmatize_new_text(self, text):
        lemmatized = [
            self.lemmatizer.lemmatize(token) for token in self.tokenize(text)
        ]
        out = []
        for lem in lemmatized:
            if lem in self.vocab:
                out.append(lem)
        return out

    def bag_of_words(self, docs: List[str]) -> List[List[int]]:
        """
        Featurization via bag of words.
        :param docs: Documents (texts) to be BOWed.
        :return: Feature vector
        """
        return self.count_vectorizer.fit_transform(docs).toarray()

    def bag_of_important_words_stem(self, docs: List[str], ratioP=0.75,
                                    ratioN=0.2) -> List[List[int]]:
        """
        Featurization via bag of important words (stemmed).
        :param docs: Documents (texts) to be BOWed.
        :return: Feature vector
        """
        important_words = dict()
        for item in self.important_words.values():
            new_word = self.stemmer.stem(item["word"])
            important_words[new_word] = {
                "r1": item["r1"],
                "r2": item["r2"],
                "word": new_word
            }
        imp_words = [
            word["word"] for word in important_words.values()
            if word["r1"] > ratioP or word["r2"] < ratioN
        ]
        return [[1 if word in doc else 0 for word in imp_words]
                for doc in docs]

    def bag_of_important_words_lem(self, docs: List[str], ratioP=0.75,
                                   ratioN=0.2) -> List[List[int]]:
        """
        Featurization via bag of important words (lemmatized).
        :param docs: Documents (texts) to be BOWed.
        :return: Feature vector
        """
        important_words = dict()
        for item in self.important_words.values():
            new_word = self.lemmatizer.lemmatize(item["word"])
            important_words[new_word] = {
                "r1": item["r1"],
                "r2": item["r2"],
                "word": new_word
            }
        imp_words = [
            word["word"] for word in important_words.values()
            if word["r1"] > ratioP or word["r2"] < ratioN
        ]
        return [[1 if word in doc else 0 for word in imp_words]
                for doc in docs]

    def tf_idf(self, docs: List[str]) -> csr_matrix:
        """
        Featurization via TF/IDF.
        :param docs: Documents (texts) to be featurized.
        :return: Feature vector
        """
        return self.tfdif_vectorizer.fit_transform(docs).toarray()

    def stem_text(self, text: str) -> List[str]:
        """
        Conducts stemming of given text
        :param text: Text to be tokenized and stemmed
        :return: array of stemmed tokens.
        """
        return [self.stemmer.stem(token) for token in self.tokenize(text)]

    def check_duplicates(self, name, group: List[Commit]):
        """
        Checks a group for duplicates
        :param name: name of group (for output)
        :param group: of commits
        :return: nothing
        """
        seen = set()
        not_uniq = set()
        for x in group:
            if x not in seen:
                seen.add(x)
            else:
                not_uniq.add(x)
        for val in not_uniq:
            print(name + " duplicate: " + val.cmt_hash)

    def contains(self, group: List[Commit], contained: List[Commit]):
        """
        Checks if the contained commits are really contained in group
        :param group: to check if contained is in
        :param contained: commits that should be in group
        :return: nothing. Console print if contained items NOT in group
        """
        if not all(e in group for e in contained):
            print("  does not check out")
            for i in contained:
                if i not in group:
                    print("  Failed: " + i.cmt_hash)
def main():
    print('Welcome to IPOL interpreter!')
    # returns lines of string containing the cleaned code
    file_reader = FileReader()
    # tabs removed, double spaces removed
    lines = file_reader.read_file()
    tokenizer = Tokenizer()
    # returns a 2d list containing the tokens per line of code
    tokens_list = tokenizer.tokenize(lines)
    tokens_list_copy = tokens_list.copy()
    # create instance of the parser with the syntax declared in Syntax class
    parser = Parser(syntax=Syntax().get_syntax())
    # iterate each line of the list containing the tokens
    for line in tokens_list:
        recursive_parse(parser, line, callback)
    # create a new instance of the parser, now with the syntax for reducing
    # operations to expressions
    parser = Parser(syntax=Syntax().get_final_syntax())
    # parse to an expression to see if it is valid
    for line in parsed_list:
        recursive_parse(parser, line, callback1)
    exception_checker = ExceptionCheker()
    for i in range(len(final_parsed_list)):
        # there must be a syntax error because it cannot be converted to a
        # single statement; check which kind of exception it is
        if len(final_parsed_list[i]) > 1:
            exception = exception_checker.check_exception(
                final_parsed_list[i], i)
            if isinstance(exception, IpolException):
                exceptions.append(exception)
    # now check if the overall structure of the code is valid
    # check if there are unused values
    # for index, token in enumerate(reduce(final_parsed_list)):
    #     if token.type == Type.NUMBER or token.type == Type.STR:
    #         exceptions.append(IpolException(
    #             ExceptionType.UNUSED_VALUE_ERROR, None, index))
    # print exceptions if there are any and halt the build process
    if len(exceptions) > 0:
        for exception in exceptions:
            exception.print()
        return
    else:
        # create a new instance of the parser, now with the syntax of the
        # overall ipol code
        parser = Parser(syntax=Syntax().get_ipol_syntax())
        # finally, verify that the full code is valid
        reduced_final_parsed_list = reduce(final_parsed_list)
        # recursive_parse(parser, reduced_final_parsed_list, callback2)
        reduced_final_parsed_list[:] = (
            token for token in reduced_final_parsed_list
            if token.type != Type.EMPTY_LINE)
        recursive_parse(parser, reduced_final_parsed_list, callback2)
        for line in ipol_code_verified:
            for token in line:
                print(token.type)
    # check syntax in class Syntax
    # Type.E means accepted
    build_failed_message = 'Build Failed.'
    try:
        if ipol_code_verified[0][0].type == Type.E:
            print('Build Successful\n')
        else:
            print(build_failed_message)
            return
    except:
        print(build_failed_message)
        return
    # there are no exceptions; continue with code generation
    tokens_list_copy.pop(0)
    tokens_list_copy.pop(len(tokens_list_copy) - 1)
    generated_code = CodeGenerator().generate(tokens_list_copy)
    # this may return a bool data type
    if isinstance(generated_code, list):
        runnable_code = '\n'.join(generated_code)
        runnable_code = runnable_code.replace('&n0', '')
        # run the generated python code
        with open('ic.py', '+w') as ic:
            ic.write(runnable_code)
        print('\nBuild Complete.\nView logs on ipol_logs.txt\n'
              'View generated code on ic.py\n')
        exec(runnable_code, globals())
        with open('ipol_logs.txt', '+w') as logs:
            text_to_write = 'PARSING LOGS\n\nGENERATED TOKENS\n'
            for line in tokens_list:
                for token in line:
                    text_to_write = text_to_write + \
                        '{} -> {}'.format(token.type, token.val) + ", "
                text_to_write = text_to_write + '\n'
            # the original wrote '\PARSED AS...' (a literal backslash);
            # a newline was almost certainly intended
            text_to_write = text_to_write + '\nPARSED AS...\n'
            for line in parsed_list:
                for token in line:
                    text_to_write = text_to_write + str(token.type) + ', '
                text_to_write = text_to_write + '\n'
            text_to_write = text_to_write + \
                '\nGENERATED INTERMEDIATE CODE\n' + runnable_code
            logs.write(text_to_write)
    # if bool is returned, that means there was something wrong with the
    # ipol code
    else:
        print('Build failed')
def __init__(self):
    self.utils = Utils()
    self.filereader = FileReader()
def read(self, fs):
    """Basic file reader for single column hex files."""
    reader = FileReader(fs, fields=(('values', 'x8'), ))
    values = reader.read()['values']
    self.deserialize(values)
def read(self, fs):
    """Basic file reader for multiple 32 bit column hex files."""
    reader = FileReader(fs, fields=(('values', 'x8', self.columns), ))
    self.clear()
    for column, values in enumerate(reader.read()['values']):
        self.inject(values, column, 1)
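# A plausible input for these two readers, assuming the 'x8' field spec means
# one 8-digit (32-bit) hex word per column, whitespace-separated; the
# single-column variant would carry just the first column:
#   DEADBEEF 00000001
#   CAFEF00D 00000002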
def testfound(self):
    # fnamelist = ["../wrongdocfile",]
    # fnamelist = ["../10"+str(v)+".csv" for v in xrange(22,23)]
    # fnamelist = ["../1125.csv",]
    # note: the original used leading-zero literals like xrange(01,10);
    # written as 1 here, the value is identical
    fnamelist = ["../10" + str(v) + ".csv" for v in xrange(22, 32)] + \
        ["../110" + str(v) + ".csv" for v in xrange(1, 10)] + \
        ["../11" + str(v) + ".csv" for v in xrange(10, 31)] + \
        ["../120" + str(v) + ".csv" for v in xrange(1, 10)] + \
        ["../12" + str(v) + ".csv" for v in xrange(10, 23)]
    cnt = 0
    found = 0
    wholeresult = {}
    writeflag = True
    if writeflag:
        writefile = open("../cleandata", "w")
    wrongdocflag = False
    if wrongdocflag:
        wrongdocfile = open("../wrongdocfile", "w")
        wrongdocfile.write(",".join([
            '"ALARMHAPPENTIME"', '"ALARMCODE"', '"LOCATION"', '"SUMMARY"'
        ]) + "\n")
    missloc = {}
    print "||||||||||||||||"
    fnamecnt = open("fnamecnt", "w")
    for fname in fnamelist:
        print fname
        filereader = FileReader(fname)
        alarmcodeidx = filereader.getattridx("ALARMCODE")
        attridx = filereader.getattridx("SUMMARY")
        locidx = filereader.getattridx("LOCATION")
        timeidx = filereader.getattridx("ALARMHAPPENTIME")
        identifieridx = filereader.getattridx("NEIDENTIFIER")
        cntidx = 0
        while True:
            tmptran = filereader.readtransection()
            cntidx += 1
            # print cntidx
            if tmptran is None:
                filereader.close()
                break
            summary = tmptran[attridx]
            location = tmptran[locidx]
            alarmcode = tmptran[alarmcodeidx]
            identifier = tmptran[identifieridx]
            warn = Warning(summary, location)
            if warn.m_type == NOTP4:
                continue
            ftword = warn.getfirstword()
            if ftword not in wholeresult:
                wholeresult[ftword] = {"cnt": 0, "good": 0}
            wholeresult[ftword]["cnt"] += 1
            cnt += 1
            loc = self.m_topo.getnebyidentifier(identifier)
            if loc is None:
                loc = warn.fetchloc(self.m_topo)
            if loc is None:
                locstr = warn.fetchlocstr()
                if warn.m_type != NOTP5 and warn.m_type != TP9:
                    if locstr not in missloc:
                        missloc[locstr] = 0
                        print "==============================================="
                        print warn.m_summary
                        print "----------------------------------"
                        print warn.m_location
                        print "----------------------------------"
                        print identifier
                        print "locstr:", warn.m_type, locstr
                    missloc[locstr] += 1
                if wrongdocflag:
                    wrongdocfile.write(",".join(['"' + v + '"' for v in [
                        tmptran[timeidx], tmptran[alarmcodeidx],
                        tmptran[locidx], tmptran[attridx],
                    ]]) + "\r\n")
                continue
            wholeresult[ftword]["good"] += 1
            found += 1
            summary = summary.replace("\n", "_")
            if writeflag:
                writefile.write(alarmcode + "\t" + loc.m_name + "\t" +
                                summary + "\t" + tmptran[timeidx] + "\n")
        print fname, "\t", cntidx
        fnamecnt.write(fname + "\t" + str(cntidx) + "\n")
    fnamecnt.close()
    if writeflag:
        writefile.close()
    if wrongdocflag:
        wrongdocfile.close()
    print "result:"
    print "cnt:", cnt
    print "found:", found
    print "pcg:", found * 1.0 / cnt
    # safe in Python 2: keys() returns a list copy, so deleting while
    # iterating does not invalidate the loop
    for v in wholeresult.keys():
        if wholeresult[v]["good"] == wholeresult[v]["cnt"]:
            del wholeresult[v]
        else:
            wholeresult[v]["pcg"] = wholeresult[v]["good"] * 1.0 / \
                wholeresult[v]["cnt"]
    import pprint
    pprint.pprint(wholeresult)
    print "-----------------------"
    pprint.pprint(missloc)
    print "missloclen:", len(missloc)
    json.dump(wholeresult, open("tmpwholeresult", "w"))
    json.dump(missloc, open("missloc", "w"))
from filereader import FileReader

file_reader = FileReader('temperature.log')
temperatures = file_reader.get_temperatures()

for temperature in sorted(temperatures):
    print("Day:{} Temps:{}".format(temperature, temperatures[temperature]))
def false_positive(self, partitions: ClusterPoints,
                   clusters: ClusterPoints):
    return self.sum_of_pairs(clusters) - self.true_positive(
        partitions, clusters)

@staticmethod
def sum_of_pairs(cluster_points: ClusterPoints):
    combinations = 0
    for cluster_id in cluster_points.cluster_ids():
        cluster_count = cluster_points.points_count(cluster_id)
        combinations += cluster_count * (cluster_count - 1)
    return combinations / 2


nmi = ClusterEvaluator(NormalizedMutualInformation(), FileReader("", " "))
jcs = ClusterEvaluator(JaccardSimilarity(), FileReader("", " "))
results = list()
for iii in range(1, 6):
    nmi_result = nmi.evaluate("data/partitions.txt",
                              "data/clustering_" + str(iii) + ".txt")
    jcs_result = jcs.evaluate("data/partitions.txt",
                              "data/clustering_" + str(iii) + ".txt")
    print("////// " + str(iii) + " ///////")
    print(nmi_result)
    print(jcs_result)
    print()
    results.append([nmi_result, jcs_result])

writer = FileWriter("data/scores.txt", " ")
writer.write_list_of_rows_to_file(results)
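# Worked example for sum_of_pairs above: cluster sizes [3, 2] give
# 3*2/2 + 2*1/2 = 3 + 1 = 4 intra-cluster pairs, i.e. the sum of C(n, 2)
# over clusters that pair-counting measures such as Jaccard operate on.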
def __init__(self):
    self.utils = Utils()
    self.filereader = FileReader()
    self.parser = Parser()