def __init__(self, tickets, start='2019-08-01', interval='1h'):
    self.tickets = tickets
    self.start = start
    self.interval = interval
    self.dp = DataParser(tickets, start=self.start, interval=self.interval)
    self.dp.download_data()
    self.dp.parse_to_week_data()
    self.data = self.dp.week_data
    self.budget = 10000.0
    self.start_budget = None
    self.initial_buy_date = None
    self.buy_n = None
    self.sell_n = None
    self.stop_budget = None
    self.market_first_price = None
    self.market_money = None
    self.market_stock_n = None
    self.market_last_price = None
    self.market_stop_budget = None
    self.buy_day = None
    self.sell_day = None

def Dataset_i(self, i):
    data_origin = DataParser().AmericanOrigination_i(i)
    data_monthly = DataParser().AmericanMonthly_i(i)
    dataset = FeatureExtraction().combine_data_and_origin(
        data_origin, data_monthly)
    assert isinstance(dataset, pd.DataFrame)
    return dataset

def run(self):
    """Run the main execution."""
    end = False
    pop = 100
    nb_step = 5000
    # Get options
    options = get_options()
    # Get SUMO binary
    if options.nogui:
        sumoBinary = checkBinary('sumo')
    else:
        sumoBinary = checkBinary('sumo-gui')
    # Get number of steps
    if options.nb_step is not None:
        nb_step = options.nb_step
    # Load the Q-table if wanted
    if options.loadqt:
        self.qlu.load_q_table()
    # Traci starts SUMO as a subprocess; this script then connects and runs
    traci.start(
        [sumoBinary, "-c", "map.sumocfg", "--tripinfo-output", "data.xml"])
    # Create agents
    self.create_rand_agents(pop)
    traci.simulationStep()
    # Main loop
    for i in range(nb_step):
        print('Step #' + str(i))
        traci.simulationStep()
        self.maintain_rand_agents()
        if i > 99:
            if (i - 100) % 14 == 0:
                # Create an agent
                self.our_agents[Agent.ID] = OurAgent('e', self.qlu)
            elif (i - 100) % 7 == 0:
                # Create a dummy
                self.dummy_agents[Agent.ID] = InterestingAgent('e')
        self.control_agents()
    traci.close()
    sys.stdout.flush()
    data_dum = DataParser("data.xml", list(self.dummy_agents.keys()))
    data_our = DataParser("data.xml", list(self.our_agents.keys()))
    self.qlu.save_q_table()

def Dataset_i_Reduced(self, i):
    data_origin = DataParser().AmericanOrigination_i(i)
    data_monthly = DataParser().AmericanMonthly_i(i)
    print("Origin : " + str(data_origin.shape))
    print("Monthly : " + str(data_monthly.shape))
    fe = FeatureExtraction()
    fe.TESTING = True
    dataset = fe.combine_data_and_origin(data_origin, data_monthly)
    assert isinstance(dataset, pd.DataFrame)
    return dataset

def Dataset(self):
    """pd.DataFrame: combined origination and monthly data (property getter)."""
    data_origin = DataParser().AmericanOrigination
    data_monthly = DataParser().AmericanMonthly
    dataset = FeatureExtraction().combine_data_and_origin(
        data_origin, data_monthly)
    assert isinstance(dataset, pd.DataFrame)
    return dataset

def main():
    print("parsing data... this could take a while...")
    dp = DataParser('posts_news')
    featuresToResultsAll = dp.getFeatureResultPairs()
    # calculate weights
    weights, testData = getWeightsAndTestData(featuresToResultsAll)
    i = 0
    totalError = 0.0
    print("\nPrinting Example Results:")
    for fv, target in testData:
        i += 1
        prediction = predict(weights, fv)
        if i % 20 == 0:
            printResults(prediction, target)
        # error = abs(len(str(prediction)) - len(str(target)))
        error = abs(prediction - target)
        totalError += error
    totalError /= i
    print("total error as average difference between prediction and target: %s"
          % totalError)
    dp.printMostProvocativeWords(50)
    dp.printMostProvocativeBigrams(50)
    dp.printMostProvocativeTrigrams(50)

def __parseHeaderKeyValueLine(self, fileLine, dict):
    key = ""
    values = []
    lastIdx = 0
    currIdx = 0
    # Split the line on ';': the field before the first ';' is the key,
    # the remaining fields are the values.
    while currIdx < len(fileLine):
        item = fileLine[currIdx]
        if item == ";":
            if lastIdx < currIdx:
                if lastIdx == 0:
                    key = fileLine[lastIdx:currIdx]
                else:
                    values.append(fileLine[lastIdx:currIdx])
            lastIdx = currIdx + 1
        currIdx += 1
    if lastIdx < currIdx:
        values.append(fileLine[lastIdx:currIdx])
    # Convert each value to the type expected for this key.
    idx = 0
    while len(values) > idx:
        if key == HeaderParser.Head_Key_Order:
            values[idx] = int(values[idx])
        elif key == HeaderParser.Head_Key_PrePressure:
            values[idx] = float(
                DataParser.checkAndReplaceComma(self, values[idx]))
        elif key == HeaderParser.Head_Key_Furnace_Temp:
            values[idx] = int(values[idx])
        elif key == HeaderParser.Head_Key_Part_Weight:
            values[idx] = int(values[idx])
        elif key == HeaderParser.Foot_Key_Amount:
            values[idx] = int(values[idx])
        elif key == HeaderParser.Foot_Key_Order_Set:
            values[idx] = int(values[idx])
        elif key == HeaderParser.Foot_Key_Order_Cnt:
            values[idx] = int(values[idx])
        elif key == HeaderParser.Foot_Key_Charge_Set:
            values[idx] = int(values[idx])
        # elif key == HeaderParser.Foot_Key_Charge_Cnt:
        #     values[idx] = int(values[idx])
        elif key == HeaderParser.Foot_Key_Cycle_Brutto:
            values[idx] = float(
                DataParser.checkAndReplaceComma(self, values[idx]))
        elif key == HeaderParser.Foot_Key_Cycle_Netto:
            values[idx] = float(
                DataParser.checkAndReplaceComma(self, values[idx]))
        dict[key] = values
        idx += 1

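# Illustrative check, not from the original source: the loop above splits a
# header line on ';', taking the field before the first ';' as the key and
# the later fields as values (empty fields are skipped). The sample line is
# an assumed input format, shown only to demonstrate that split behaviour.
sample_line = "Order;1;2;3"
fields = [f for f in sample_line.split(";") if f]
key, values = fields[0], fields[1:]
assert (key, values) == ("Order", ["1", "2", "3"])
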
def __init__(self):
    # Instantiate the peer parser class
    self.parser = DataParser()
    # Set up three URL queues: not yet downloaded, downloading, downloaded
    self.beforedownloadset = set()
    self.beingdownloadset = set()
    self.afterdownloadset = set()

def run(self):
    # Connect to db, receive db connection object
    db_connect = DBConnect()
    db_connect.connect()
    db = db_connect.get_connection()
    """
    # Start reading the file, receive results list with data
    fileReader = FileReader(self.url)
    fileReader.open_and_read_file()
    result = fileReader.getResults()
    """
    # Setting up a class object and connecting.
    unix_reader = UNIXReader("src/client/upload_stream.sock")
    unix_reader.connect()
    # Data parser class object to parse received data
    data_parser = DataParser()
    # Query class object getting ready to query database
    query_db = QueryDB(db)
    query_db.find_trip_id()
    print('Running main loop')
    while True:
        # Receive data from unix reader object
        data = unix_reader.revc_socket()
        json_data = json.loads(data)
        print(json_data)
        # Parse the result data to appropriate format
        if json_data:
            sorted_results = data_parser.parseToDict(json_data)
            if db_connect.check_connection():
                # Send data to database
                query_db.query(sorted_results)
            else:
                db_connect.connect()
                # Send data to database
                query_db.query(sorted_results)
    # Close connection to database (unreachable while the loop runs forever)
    db_connect.disconnect()

def main():
    # ======= COLLECT DATA =======
    # set to True if data needs to be collected
    if False:
        dp = DataParser()
        dp.readData()
    # ======= READ IN DATA =======
    df = pd.DataFrame()
    for i in range(0, 5):
        df = df.append(
            pd.read_csv("data/vancouver_data_{0}.csv".format(i),
                        sep="\t", index_col=False))
    # ======= PROCESS DATA =======
    dl = DeepLearning(df.values)
    dl.process()

def __init__(self, ipAddress):
    self.hardData = DataParser("./Data.json")
    self.mqtt_sub = mqtt.Client("Listener-Composition")
    self.mqtt_pub = mqtt.Client("Writer-Composition")
    self.ipAddress = ipAddress
    self.stateVariables = StateVaribles()
    self.prevStateVariables = StateVaribles()
    self.actionVariables = ActionVariables()
    # MQTT
    self.mqtt_sub.on_message = self.on_context_message
    self.mqtt_sub.connect(self.ipAddress, 1883, 70)
    self.mqtt_sub.subscribe("Context/#", qos=2)
    self.mqtt_pub.connect(self.ipAddress, 1883, 70)
    self.mqtt_pub.loop_start()

def Dataset(self):
    """pd.DataFrame: German credit data with all feature extraction applied."""
    raw_data_set = DataParser().GermanCredit
    dataset = raw_data_set
    # dataset = self.format_data(raw_data_set)
    dataset = FeatureExtraction().apply_all(dataset)
    assert isinstance(dataset, pd.DataFrame)
    return dataset

def Dataset(self):
    """pd.DataFrame: preprocessed replicated Lending Club data."""
    # raw_data_set = DataParser().GermanCredit
    # dataset = FeatureExtraction().apply_all(raw_data_set)
    dataset = DataParser().replicateDataLendingClubProcessed
    assert isinstance(dataset, pd.DataFrame)
    return dataset

def main():
    if len(sys.argv) != 4:
        print('Missing file operands!')
        print('PerfDataViewer.py [program] [input_data_file] [output_data_file]')
        return
    program = str(sys.argv[1])
    inFileName = str(sys.argv[2])
    outFileName = str(sys.argv[3])
    # TODO: move these to configuration files
    eventsList = ['0x149', '0x151', '0x2a2', '0x126', '0x227', '0x224',
                  '0x8a2', '0x1b0', '0x20f0', '0x2f1', '0x1f2', '0x1b8',
                  '0x2b8', '0x4b8', '0x40cb']
    ppc_eventsList = ['0x3c046', '0x2c048', '0x2f080', '0x26080', '0x30881',
                      '0x26182', '0x26880', '0xd0a2', '0xd0a0']
    arffHeader = ['@relation function_level_badfs_badma_good\n',
                  '@attribute r0149 numeric\n', '@attribute r0151 numeric\n',
                  '@attribute r02a2 numeric\n', '@attribute r0126 numeric\n',
                  '@attribute r0227 numeric\n', '@attribute r0224 numeric\n',
                  '@attribute r08a2 numeric\n', '@attribute r01b0 numeric\n',
                  '@attribute r20f0 numeric\n', '@attribute r02f1 numeric\n',
                  '@attribute r01f2 numeric\n', '@attribute r01b8 numeric\n',
                  '@attribute r02b8 numeric\n', '@attribute r04b8 numeric\n',
                  '@attribute r40cb numeric\n',
                  '@attribute status {good, badfs, badma}\n',
                  '@data\n']
    ppc_arffHeader = ['@relation function_level_badfs_badma_good\n',
                      '@attribute r3c046 numeric\n', '@attribute r2c048 numeric\n',
                      '@attribute r2f080 numeric\n', '@attribute r26080 numeric\n',
                      '@attribute r30881 numeric\n', '@attribute r26182 numeric\n',
                      '@attribute r26880 numeric\n', '@attribute rd0a2 numeric\n',
                      '@attribute rd0a0 numeric\n',
                      '@attribute status {good, badfs, badma}\n', '@data\n']
    perfData = PerfData()
    perfFileReader = FileReader(inFileName)
    dataParser = DataParser(perfFileReader, perfData, program)
    eventsHolder = EventsHolder(eventsList)
    eventsHolder.setInstructionCountRawEvent('0xc0')
    arffWriter = ArffWriter(outFileName, arffHeader)
    dataWriter = DataWriter(arffWriter, perfData, eventsHolder)
    dataParser.parse()
    print(perfData.getDataStore())
    dataWriter.writeToArffFile()
    print(outFileName + ' file was created successfully.')

def get_info_A(self):
    def get_data_i_FE2(i):
        print("Read" + str(i))
        return DataParser().AmericanCombo_i_FE2(i)

    def write_out_data(df, i):
        print("Writing out data")
        DataParser()._write_HDFStore_Combined_FE2(df, i)

    data_count = DataParser().number_of_datasets
    i = 0
    df_ALL = pd.DataFrame()
    df_ALL2 = pd.DataFrame()
    LOAN_COUNT = 0
    FICO_MEAN = 0
    FICO_MEDIAN = []
    BALANCE_MEAN = 0
    while i < data_count:
        df_X = get_data_i_FE2(i)
        df_X = df_X.sort_values("id_loan")
        df_X.reset_index(drop=True, inplace=True)
        n = 20000
        if len(df_X) < n:
            n = len(df_X)
        df_ALL = pd.concat([df_X[:n], df_ALL], axis=0)
        m = 5000
        if len(df_X) < m:
            m = len(df_X)
        df_ALL2 = pd.concat([df_X[:m], df_ALL2], axis=0)
        # LOAN_COUNT += len(np.unique(df_X['id_loan'].values))
        # FICO_MEAN += df_X['fico'].mean()
        # BALANCE_MEAN += df_X['orig_upb'].mean()
        # FICO_MEDIAN.append(df_X['fico'].median())
        i += 1
    FICO_MEAN = FICO_MEAN / i
    FICO_MEDIAN = np.median(FICO_MEDIAN)
    print("FICO_MEDIAN: " + str(FICO_MEDIAN))
    print("FICO_MEAN: " + str(FICO_MEAN))
    print("LOAN_COUNT: " + str(LOAN_COUNT))
    print("BALANCE_MEAN: " + str(BALANCE_MEAN))
    write_out_data(df_ALL, -1)
    write_out_data(df_ALL2, -2)

def ComputePrecisionK(modelfile, testfile, K_list):
    maxParagraphLength = 10
    maxParagraphs = 4
    # nlabels = 1001
    # vocabularySize = 76391
    labels = 8
    vocabularySize = 244
    model = Model(maxParagraphLength, maxParagraphs, labels, vocabularySize)
    testing = DataParser(maxParagraphLength, maxParagraphs, labels,
                         vocabularySize)
    print(testfile)
    testing.getDataFromfile(testfile)
    print("data loading done")
    print("no of test examples: " + str(testing.totalPages))
    model.load(modelfile)
    print("model loading done")
    batchSize = 1
    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])
    precAtK = {}
    for itr in K_list:
        precAtK[itr] = 0
    for i, v in enumerate(pred):
        temp = [(labId, labProb) for labId, labProb in enumerate(v)]
        # print(temp)
        temp = sorted(temp, key=lambda x: x[1], reverse=True)
        for ele in K_list:
            pBag = 0
            for itr in range(ele):
                if truePre[i][0][temp[itr][0]] == 1:
                    pBag += 1
            # print(float(pBag)/float(ele))
            precAtK[ele] += float(pBag) / float(ele)
    f = open("results/precAtK_model3_n", "w")
    for key in sorted(precAtK.keys()):
        # print(key, precAtK[key]/len(pred))
        print(precAtK[key] / len(pred))
        f.write(str(key) + "\t" + str(precAtK[key] / len(pred)) + "\n")
    f.close()

def ComputePrecisionK(modelfile, testfile, K_list):
    CURRENT_DIR = os.path.dirname(os.path.abspath("./WikiCategoryLabelling/"))
    sys.path.append(os.path.dirname(CURRENT_DIR + "/WikiCategoryLabelling/"))
    maxParagraphLength = 250
    maxParagraphs = 10
    labels = 1001
    vocabularySize = 76390
    model = Model(maxParagraphLength, maxParagraphs, labels, vocabularySize)
    testing = DataParser(maxParagraphLength, maxParagraphs, labels,
                         vocabularySize)
    testing.getDataFromfile(testfile)
    print("data loading done")
    print("no of test examples: " + str(testing.totalPages))
    model.load(modelfile)
    print("model loading done")
    batchSize = 10
    testing.restore()
    truePre = []
    pred = []
    for i in range(math.ceil(testing.totalPages / batchSize)):
        if i < testing.totalPages / batchSize:
            data = testing.nextBatch(batchSize)
        else:
            data = testing.nextBatch(testing.totalPages % batchSize)
        truePre.extend(data[0])
        pre = model.predict(data)
        pred.extend(pre[0].tolist())
    avgPrecK = [0] * len(K_list)
    for i, p in enumerate(pred):
        sortedL = sorted(range(len(p)), key=p.__getitem__, reverse=True)
        for k, K in enumerate(K_list):
            labelK = sortedL[:K]
            precK = 0
            for l in labelK:
                if truePre[i][l] == 1:
                    precK += 1
            avgPrecK[k] += precK / float(K)
    avgPrecK = [float(a) / len(pred) for a in avgPrecK]
    for p in avgPrecK:
        print(str(p))

def __init__(self):
    # Instantiate the other functional modules
    self.urlgen = URLGenerator()
    self.downloader = Downloader()
    self.parser = DataParser()
    self.datastore = DataStore()
    # Set up three URL queues: not yet downloaded, downloading, downloaded
    self.beforedownloadset = set()
    self.beingdownloadset = set()
    self.afterdownloadset = set()
    # Set the seed URL
    self.seedurl = 'https://so.gushiwen.org/authors/'

def main(): """Parse and graph data provided in Data(Relevant).csv file.""" # Take file name as raw string data_file = r'Data(Relevant).csv' # Regular expression that only accepts strings with valid DC # names 'I', 'A', and 'S'. my_regex = r'^[IAS]$' # List to store user inputed data center names dc_name_list = [] print('Enter data centers to be graphed') while True: # Loop until user enters valid Data center name user_input = input(">> ") # Checking if user_input matches defined regular expression if re.search(my_regex, user_input): dc_name_list.append(user_input) # Breaking from loop when user enters nothing. elif user_input == '': break else: print("Valid dc_name_list: 'I' 'A' 'S'") # Modifying data center name list to only have unique dc names dc_name_list = set(dc_name_list) # Creating data parser to parse data_file, returning records # corresponding to the data centers specified in the dc_name_list dc_data_parser = DataParser(data_file, dc_name_list) dc_dataset = dc_data_parser.get_dataset() # Creating a DatasetDisplay object for plotting and displaying # the data center dataset. dc_data_display = DatasetDisplay(dc_dataset, dc_name_list) dc_data_display.show_plot()
def parseKeyValueFile(path):
    # create the state variables and parsers
    headerSection = False
    footerSection = False
    dataSection = False
    head = HeaderParser()
    foot = HeaderParser()
    data = DataParser()
    headerDict = {}
    footerDict = {}
    dataList = []
    # open the file and go through all lines
    with open(path, 'r') as file:
        for line in file:
            # remove CR/LF
            line = line[:-1]
            # Check for a section trigger
            line, headerSection, footerSection, dataSection = checkSectionTrigger(
                line, headerSection, footerSection, dataSection)
            # Dispatch the line to the header, footer or data parser
            if headerSection:
                head.parseHeaderLine(line, headerDict)
            elif footerSection:
                foot.parseHeaderLine(line, footerDict)
            elif dataSection:
                data.parseDataLine(line, dataList)
    return headerDict, footerDict, dataList

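# Minimal usage sketch (not part of the original module; the file name is
# hypothetical). parseKeyValueFile returns the header dict, footer dict and
# data list built above:
if __name__ == '__main__':
    headerDict, footerDict, dataList = parseKeyValueFile("measurement.txt")
    print("header keys :", sorted(headerDict.keys()))
    print("footer keys :", sorted(footerDict.keys()))
    print("data lines  :", len(dataList))
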
def genAnalysis(modelfile, testfile, outputfile):
    maxParagraphLength = 20
    maxParagraphs = 5
    filterSizes = [1]
    num_filters = 64
    wordEmbeddingDimension = 30
    lrate = float(1e-3)
    labels = 30938
    vocabularySize = 101939
    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate)
    testing = DataParser(maxParagraphs, maxParagraphLength, labels,
                         vocabularySize)
    testing.getDataFromfile(testfile)
    model.load(modelfile)
    print("loading done")
    print("no of test examples: " + str(testing.totalPages))
    batchSize = 1
    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])
    labelIDName = open("../labelId-labelName-full.txt").read().split("\n")
    labelIDName = [[int(x.split("\t")[0]), x.split("\t")[1].rstrip()]
                   for x in labelIDName]
    # print(labelIDName)
    # making it a dictionary
    labelName = dict(labelIDName)
    # print(labelName[9026])
    f = open(outputfile, "w")
    for i, v in enumerate(pred):
        temp = [(labId, labProb) for labId, labProb in enumerate(v)]
        # sorting based on label probability to get the top k
        temp = sorted(temp, key=lambda x: x[1], reverse=True)
        predLabel = [0] * len(temp)
        output = ""
        for itr in range(11):
            predLabel[temp[itr][0]] = 1
            if truePre[i][0][temp[itr][0]] == 1:
                output = output + "," + labelName[temp[itr][0]]
        f.write(str(i) + "," + output + "\n")
    f.close()

def __init__(self):
    super(MyDlg, self).__init__()
    # Set up the user interface from Designer.
    self.ui = Ui_Dialog()
    self.ui.setupUi(self)
    self.setWindowTitle("Lambda(λ) debugger Tool by EricWang")
    view = CmderView()
    self.init_view(view, self.ui)
    self.m_ctrller = CmderCtrller(view)
    self.m_ctrller.m_dataParser = DataParser()
    self.m_ctrller.m_keyPassCtrlExt = KeyPassCtrlExt()
    extCmd = ExtendCmd()
    self.m_ctrller.m_extCmdColls = extCmd.get_cmd_colls()
    self.m_ctrller.bind_cmd_set()
    self.make_event()

class Main():
    if __name__ == '__main__':
        # This handles Twitter authentication and the connection to the
        # Twitter API. The API key information is stored in data/keys.json
        with open('../data/keys.json') as json_file:
            data = json.load(json_file)
        auth = OAuthHandler(data['consumer_key'], data['consumer_secret'])
        auth.set_access_token(data['access_token'],
                              data['access_token_secret'])
        # Creates a new SearchAPI object and gets the previous tweets
        # searchAPI = SearchAPI()
        # searchAPI.getAccountsFollowers(auth, "KMbappe")
        dp = DataParser()
        nbFollowers = len(dp.getAllFollowers())
        allTweets = dp.getAllTweets()
        week = Week()
        week.fill_week(allTweets)
        week.get_days_activity(nbFollowers)
        week.tuesday.getActivityOfTheHours()

def __init__(self):
    self.dataParser = DataParser.getInstance()
    self.dataParser.LoadDataFile("inputData.xml")
    self.plants = []
    self.grazers = []
    self.predators = []
    self.obstacles = []
    dispatcher.connect(self.HandlePlantDeath,
                       signal=EventSignals.plantDeath,
                       sender=dispatcher.Any)
    dispatcher.connect(self.HandlePlantCreation,
                       signal=EventSignals.plantCreation,
                       sender=dispatcher.Any)
    dispatcher.connect(self.HandlePredatorDeath,
                       signal=EventSignals.predatorDeath,
                       sender=dispatcher.Any)
    dispatcher.connect(self.HandlePredatorCreation,
                       signal=EventSignals.predatorCreation,
                       sender=dispatcher.Any)
    dispatcher.connect(self.HandlePredatorSensing,
                       signal=EventSignals.predatorSensing,
                       sender=dispatcher.Any)
    dispatcher.connect(self.HandleOrganismDeath,
                       signal=EventSignals.organismDeath,
                       sender=dispatcher.Any)
    dispatcher.connect(self.HandleGrazerCreation,
                       signal=EventSignals.grazerCreation,
                       sender=dispatcher.Any)
    dispatcher.connect(self.HandleGrazerSensing,
                       signal=EventSignals.grazerSensing,
                       sender=dispatcher.Any)
    dispatcher.connect(self.HandleGrazerDeath,
                       signal=EventSignals.grazerDeath,
                       sender=dispatcher.Any)

import pickle

from DataParser import DataParser
from CronsDataStructure import DataExplorer

data_obj = DataParser()
data_obj.fetch_cron_data()
data = data_obj.get_cron_data()

DataSet = {}
for node in data:
    DataSet[node] = {}
    dex = DataExplorer(node_name=node,
                       cron_data=data[node]['cron_data'],
                       duration_data=data[node]['duration'])
    dex.populate_schedule_table()
    node_schedule_table = dex.get_schedule_data()
    DataSet[node] = node_schedule_table

with open(
        "C:\\mydata\\Bits\\Courses\\4thSem\\Dissertation\\repository\\data.pickle",
        'wb') as f:
    pickle.dump(DataSet, f)

def main(_):
    global dataParser, task_name, popped_name
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_device
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('model_name: %s' % FLAGS.model_name)
    tf.logging.info('data_dir: %s' % FLAGS.data_dir)
    tf.logging.info('model_dir: %s' % FLAGS.model_dir)
    tf.logging.info('task name: %s' % FLAGS.task_name)
    tf.logging.info('learning_rate: %s' % FLAGS.learning_rate)
    tf.logging.info('batch_size: %s' % FLAGS.batch_size)
    tf.logging.info('num_epochs: %s' % FLAGS.num_epochs)
    tf.logging.info('gpu_device: %s' % FLAGS.gpu_device)
    task_name = FLAGS.task_name
    if task_name == 'finish':
        popped_name = 'like'
    elif task_name == 'like':
        popped_name = 'finish'
    else:
        raise Exception('Only support finish or like task.')
    dataParser = DataParser(track_name=FLAGS.track_name,
                            data_dir=FLAGS.data_dir)
    dataParser.load_user_behavior()
    # dataParser.load_conversion_rate()
    print("=========================== Feature Size: %d ============================"
          % dataParser.feature_length)
    params = None
    if FLAGS.model_name in ['deep_fm', 'xdeepfm']:
        params = {
            'embedding_size': FLAGS.fm_embedding_size,
            'feature_field_size': dataParser.field_size,
            'feature_size': dataParser.feature_length,
            'hidden_units': [200, 100, 75, 50, 25],
            'cin_layer_size': [50, 50, 50, 50],
            'word_size': dataParser.word_size,
            'word_field_size': 35,
            'item_size': dataParser.feature_dict['item_id'],
            'item_field_size': 400,
            'author_size': dataParser.feature_dict['author_id'],
            'author_field_size': 400,
            'music_size': dataParser.feature_dict['music_id'] + 1,
            'music_field_size': 400,
            'item_city_size': dataParser.feature_dict['item_city'] + 1,
            'item_city_field_size': 400,
            'video_size': 128,
            'audio_size': 128,
            'video_field_size': 128,
            'audio_field_size': 128,
            'item_uid_size': dataParser.feature_dict['uid'],
            'item_uid_field_size': 150,
            'author_uid_size': dataParser.feature_dict['uid'],
            'author_uid_field_size': 500,
            'music_uid_size': dataParser.feature_dict['uid'],
            'music_uid_field_size': 500,
            'learning_rate': FLAGS.learning_rate,
            'dropout_rate': FLAGS.dropout_rate,
            'batch_size': FLAGS.batch_size,
        }
    tf.logging.info('deep_fm params: %s', params)
    if FLAGS.action == 'train':
        train(params)
    elif FLAGS.action == 'evaluate':
        evaluate(params)
    elif FLAGS.action == 'predict':
        tf.logging.info('predict_output_path: %s' % FLAGS.predict_output_path)
        predict(params)
    elif FLAGS.action == 'train_evaluate':
        train_and_evaluate(params)
    else:
        raise Exception(
            'The action %s is unsupported. Only support train, evaluate, '
            'predict, train_evaluate.' % FLAGS.action)

# from DataParser_siml import DataParser_siml as DataParser
# from model2_siml import Model2_siml as Model
from DataParser import DataParser as DataParser
from model3 import Model3 as Model

maxParagraphLength = 100
maxParagraphs = 1
# nlabels = 1001
# vocabularySize = 76391
nlabels = 8
vocabularySize = 244

training = DataParser(maxParagraphLength, maxParagraphs, nlabels,
                      vocabularySize)
# training.getDataFromfile("data/wiki_fea_76390_Label_1000_train")
training.getDataFromfile(
    "C:/gitrepo/Wiki-Text-Categorization/Distant Supervision/Reuter_dataset/reuters_sparse_training.txt"
)

model = Model(maxParagraphLength, maxParagraphs, nlabels, vocabularySize)

batchSize = 64
epoch = 0
epochEnd = 105
for e in range(epoch, epochEnd):
    print('Epoch: ' + str(e + 1))
    cost = 0
    for itr in range(int(training.totalPages / batchSize)):
        cost += model.train(training.nextBatch(batchSize))
    print(str(cost / training.totalPages))

def __init__(self):
    # Instantiate the peer classes
    self.parser = DataParser()
    self.datastore = DataStore()

class Downloader(object):
    def __init__(self):
        # Instantiate the peer classes
        self.parser = DataParser()
        self.datastore = DataStore()

    # 1. Generic page request and fetch
    def get_html(self, url):
        try:
            # Use a random User-Agent
            ua = UserAgent()
            req_headers = {'User-Agent': ua.random}
            res = requests.get(url, headers=req_headers)
            if res.status_code == requests.codes.ok:
                html = res.text
                return html
            return ''
        except Exception as e:
            return e

    # 2. Download all works of the given author
    def downloadworks_oneauthor(self, start_url, authorinfotuple):
        # 1) Extract the author info; build the full request URL and the
        #    name of the output file
        pagenum = 1
        authorid = authorinfotuple[0]
        authorname = authorinfotuple[1]
        # 2) Compose the target page URL and crawl all of this author's works
        personalworks_hommeurl = start_url + 'page=%s&id=%s' % (str(pagenum),
                                                                authorid)
        # 3) Walk all pages, download them and save them to the file
        try:
            # i. Fetch the author's works home page and extract the total
            #    page count
            works_html = self.get_html(personalworks_hommeurl)
            pagecount = self.parser.getpagecount(works_html)
            # ii. Create the document and write the basic information
            totalinfo = u'\n作者:{name},页数:{pagecount}\r\n'.format(
                name=authorname, pagecount=pagecount)
            path = u'作品集'
            filename = path + '\\' + authorname + '.txt'
            self.datastore.createfile_oneauther(filename, path, totalinfo)
            # iii. Walk every page of the author's works and save the poems
            #      to the document
            for i in range(1, pagecount + 1):
                # Compose the URL of each page
                page_url = start_url + 'page=%s&id=%s' % (str(i), authorid)
                # Fetch the current works page
                time.sleep(random.randint(3, 6))
                singlepageworks_html = self.get_html(page_url)
                # Skip pages that failed to download
                if len(singlepageworks_html) > 0:
                    # Extract all works on the current page
                    titlelist, contentlist = self.parser.getworks_singlepage(
                        singlepageworks_html)
                    # Write them to the document
                    self.datastore.storeworks_singlepage(
                        filename, i, titlelist, contentlist)
            return 'finished'
        except Exception as e:
            return e

def ComputeFscore(modelfile, testfile, outputfile):
    labels = 8
    vocabularySize = 244
    regLambda = float(sys.argv[1])
    model = Model(labels, vocabularySize, regLambda)
    testing = DataParser(labels, vocabularySize)
    testing.getDataFromfile(testfile)
    model.load(modelfile)
    print("loading done")
    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])
    thres = 0.5
    # using the first 50% of the data for threshold tuning - the test and
    # cv files have been merged
    valid = int(len(truePre) * 0.5)
    labelsCount = {}
    ConfusionMa = {}
    fScr = {}
    thresLab = {}
    for la in range(labels):
        if la % 25 == 0:
            print("Current label", la)
        t = []
        p = []
        for i in range(valid):
            t.append(truePre[i][0][la])
            p.append(pred[i][la])
        bestF, bestThre = thresholdTuning(t, p)
        t = []
        p = []
        for i in range(valid, len(truePre)):
            t.append(truePre[i][0][la])
            p.append(pred[i][la])
        p = np.array(p)
        fScr[la] = f1_score(t, p >= bestThre)
        ConfusionMa[la] = confusion_matrix(t, p > bestThre)
        thresLab[la] = bestThre
    f = open(outputfile, "a")
    output = sys.argv[5]
    sum_fscore = 0.0
    for i in range(labels):
        sum_fscore = sum_fscore + fScr[i]
        output = output + "," + str(fScr[i])
    output += "," + str(sum_fscore / float(labels - 1))
    print("Fscore at " + sys.argv[3] + " epochs: " +
          str(sum_fscore / float(labels - 1)))
    f.write(output + "\n")
    f.close()

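# `thresholdTuning` is not defined in this snippet. Judging from its call
# site above, it sweeps candidate thresholds over the predicted
# probabilities and returns the best F1 score together with the threshold
# achieving it. A minimal sketch under that assumption:
import numpy as np
from sklearn.metrics import f1_score


def thresholdTuning(t, p):
    """Return (best F1, best threshold) for true labels t and scores p."""
    t, p = np.asarray(t), np.asarray(p)
    bestF, bestThre = 0.0, 0.5
    for thre in np.unique(p):
        score = f1_score(t, p >= thre)
        if score > bestF:
            bestF, bestThre = score, thre
    return bestF, bestThre
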
def analyse(modelfile, testfile, outputfile):
    maxParagraphLength = 20
    maxParagraphs = 10
    filterSizes = [2, 3, 4]
    num_filters = 64
    wordEmbeddingDimension = 100
    lrate = float(0.001)
    poolLength = 2
    labels = 30938
    vocabularySize = 101939
    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate,
                  poolLength)
    testing = DataParser(maxParagraphs, maxParagraphLength, labels,
                         vocabularySize)
    testing.getDataFromfile(testfile)
    model.load(modelfile)
    print("loading done")
    print("no of test examples: " + str(testing.totalPages))
    batchSize = 1
    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])
    labelids = open("../../dataset/sorted_labelid_sans5toplabels.txt",
                    "r").read().strip().split("\n")
    labelids = [int(x) for x in labelids]
    no_of_partition = 10
    # integer division so the result can be used as a list index
    partition_size = labels // no_of_partition
    rank1 = [0] * no_of_partition
    rank3 = [0] * no_of_partition
    rank5 = [0] * no_of_partition
    for i, v in enumerate(pred):
        temp = [(labId, labProb) for labId, labProb in enumerate(v)]
        # sorting based on label probability to get the top k
        temp = sorted(temp, key=lambda x: x[1], reverse=True)
        rank1[labelids.index(temp[0][0]) // partition_size] += 1
        rank3[labelids.index(temp[0][0]) // partition_size] += 1
        rank5[labelids.index(temp[0][0]) // partition_size] += 1
        rank3[labelids.index(temp[1][0]) // partition_size] += 1
        rank5[labelids.index(temp[1][0]) // partition_size] += 1
        rank3[labelids.index(temp[2][0]) // partition_size] += 1
        rank5[labelids.index(temp[2][0]) // partition_size] += 1
        rank5[labelids.index(temp[3][0]) // partition_size] += 1
        rank5[labelids.index(temp[4][0]) // partition_size] += 1
    rank1 = [(float(x) / testing.totalPages) * 100 for x in rank1]
    rank3 = [(float(x) / (3 * testing.totalPages)) * 100 for x in rank3]
    rank5 = [(float(x) / (5 * testing.totalPages)) * 100 for x in rank5]
    print(rank1)
    print(rank3)
    print(rank5)
    filePtr = open(outputfile, "w")
    for i in rank1:
        filePtr.write(str(i) + ",")
    filePtr.write("\n")
    for i in rank3:
        filePtr.write(str(i) + ",")
    filePtr.write("\n")
    for i in rank5:
        filePtr.write(str(i) + ",")
    filePtr.close()

def ComputePrecisionK(modelfile, testfile, outputfile):
    maxParagraphLength = int(sys.argv[1])
    maxParagraphs = int(sys.argv[2])
    filterSizes = [int(i) for i in sys.argv[3].split("-")]
    num_filters = int(sys.argv[4])
    wordEmbeddingDimension = int(sys.argv[5])
    lrate = float(sys.argv[10])
    keep_prob = 1.0
    labels = 30938
    vocabularySize = 101939
    model = Model(maxParagraphs, maxParagraphLength, labels, vocabularySize,
                  filterSizes, num_filters, wordEmbeddingDimension, lrate,
                  keep_prob)
    testing = DataParser(maxParagraphs, maxParagraphLength, labels,
                         vocabularySize)
    testing.getDataFromfile(testfile)
    model.load(modelfile)
    print("loading done")
    print("no of test examples: " + str(testing.totalPages))
    print("Computing Prec@k")
    # check if batchsize needs to be taken by parameter
    batchSize = 1
    testing.restore()
    truePre = []
    pred = []
    for itr in range(testing.totalPages):
        data = testing.nextBatch(1)
        truePre.append(data[0])
        pre = model.predict(data)
        pred.append(pre[0])
    K_list = [1, 3, 5]  # prec@1 ... prec@NoofLabels
    precAtK = [0.0] * 6
    # # As Prec is only needed on the last 50% of the test data, the first
    # # 50% being reserved for cross validation:
    # valid = int(len(truePre) * 0.5)
    # pred = pred[valid:]
    # truePre = truePre[valid:]
    for i, v in enumerate(pred):
        temp = [(labId, labProb) for labId, labProb in enumerate(v)]
        # sorting based on label probability to get the top k
        temp = sorted(temp, key=lambda x: x[1], reverse=True)
        for ele in K_list:  # 1 ... no of labels
            pBag = 0  # no of true positives for this instance
            for itr in range(ele):  # top k, i.e. top ele
                if truePre[i][0][temp[itr][0]] == 1:
                    precAtK[ele] += 1
                    # pBag += 1
            # precAtK[ele] += float(pBag)/float(ele)
    f = open(outputfile, "a")
    output = sys.argv[9]
    for k in K_list:
        precAtK[k] /= (k * len(pred))
        print("Prec@" + str(k) + " = " + str(precAtK[k]))
        output = output + "," + "Prec@" + str(k) + "=," + str(precAtK[k])
    f.write(output + "\n")
    f.close()

__author__ = 'jszheng'

import sys

from antlr4 import *
from antlr4.InputStream import InputStream

from DataLexer import DataLexer
from DataParser import DataParser

if __name__ == '__main__':
    if len(sys.argv) > 1:
        input_stream = FileStream(sys.argv[1])
    else:
        input_stream = InputStream(sys.stdin.read())
    lexer = DataLexer(input_stream)
    token_stream = CommonTokenStream(lexer)
    parser = DataParser(token_stream)
    tree = parser.top()
    lisp_tree_str = tree.toStringTree(recog=parser)
    print(lisp_tree_str)

def __init__(self, trainFile, targetFile):
    dp = DataParser(trainFile)
    self.ePtm, self.sPtm, self.initVect = dp.computeProbabilities()
    self.trueStates, self.emissions = self.parseTarget(targetFile)