# Assumed imports for these scripts (each main() below comes from its own
# script); MySQL, Env, Languages, LearningParams, Qnets, Timer, Train, Walk,
# crawl_method2, naive, balanced, dumb and randomCrawl are project-local
# modules not shown in this excerpt.
import argparse
import os
import pickle
import sys
import time
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tldextract import extract


def main():
    oparser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    oparser.add_argument("--config-file", dest="configFile", required=True,
                         help="Path to config file (containing MySQL login etc.)")
    options = oparser.parse_args()

    np.random.seed()
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, linewidth=666)

    sqlconn = MySQL(options.configFile)
    hostName = "http://vade-retro.fr/"
    #hostName = "http://www.buchmann.ch/"

    start = time.time()
    env = Env(sqlconn, hostName)
    end = time.time()
    print('Building the graph took:', end - start, 'seconds')

    #crawl_method1(sqlconn, env)
    start = time.time()
    crawl_method2(sqlconn, env)
    end = time.time()
    print('Crawling the graph took:', end - start, 'seconds')

def PickleDomain(url):
    print("Pickling", url)
    domain = extract(url).domain
    if not os.path.exists('pickled_domains/' + domain):
        sqlconn = MySQL('config.ini')
        env = Env(sqlconn, url)
        with open('pickled_domains/' + domain, 'wb') as f:
            pickle.dump(env, f)
    print("Done {}".format(domain))

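# A short usage sketch for PickleDomain(); the helper name and host list here
# are illustrative, not from the original. Pickling each crawl environment
# once lets later runs load it from disk instead of rebuilding it from MySQL.
def PickleAllDomains(urls=("http://vade-retro.fr/", "http://www.buchmann.ch/")):
    if not os.path.exists('pickled_domains'):
        os.mkdir('pickled_domains')  # PickleDomain() writes into this directory
    for url in urls:
        PickleDomain(url)
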
def main():
    global DEBUG

    oparser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    oparser.add_argument("--config-file", dest="configFile", required=True,
                         help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument("--language-pair", dest="langPair", required=True,
                         help="The two languages we're interested in, separated by a comma")
    options = oparser.parse_args()

    np.random.seed()
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, linewidth=666)

    sqlconn = MySQL(options.configFile)

    languages = Languages(sqlconn.mycursor)
    langPairList = options.langPair.split(",")
    assert len(langPairList) == 2
    langIds = [languages.GetLang(langPairList[0]),
               languages.GetLang(langPairList[1])]

    #hostName = "http://vade-retro.fr/"
    #hostName = "http://www.buchmann.ch/"
    hostName = "http://www.visitbritain.com/"
    env = Env(sqlconn, hostName)

    #DEBUG = True

    arrNaive = naive(sqlconn, env, len(env.nodes))
    arrBalanced = balanced(sqlconn, env, len(env.nodes), langIds)
    #print("arrNaive", arrNaive)
    #print("arrBalanced", arrBalanced)

    plt.plot(arrNaive, label="naive")
    plt.plot(arrBalanced, label="balanced")
    plt.legend(loc='upper left')
    plt.show()

def GetEnv(configFile, languages, url):
    domain = extract(url).domain
    filePath = 'pickled_domains/' + domain
    if not os.path.exists(filePath):
        print("mysql load", url)
        sqlconn = MySQL(configFile)
        env = Env(sqlconn, url)
    else:
        print("unpickle", url)
        with open(filePath, 'rb') as f:
            env = pickle.load(f)

    # change language of start node. 0 = stop
    env.nodes[sys.maxsize].lang = languages.GetLang("None")
    print("  ", len(env.nodes), "nodes,", env.numAligned, "aligned docs")

    #for node in env.nodes.values():
    #    print(node.Debug())

    print("env created", url)
    return env

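# GetEnvs() is called by a later main() but is not defined in this excerpt.
# A minimal sketch, assuming it builds one Env per host and resets the start
# node's language the way GetEnv() does; the (sqlconn, languages, urls)
# signature is taken from the call site, and the dict return value mirrors
# the env_train_dic/env_test_dic usage elsewhere in this file.
def GetEnvs(sqlconn, languages, urls):
    envs = {}
    for url in urls:
        env = Env(sqlconn, url)
        # change language of start node. 0 = stop
        env.nodes[sys.maxsize].lang = languages.GetLang("None")
        envs[url] = env
    return envs
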
def main():
    global TIMER
    TIMER = Timer()

    oparser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    oparser.add_argument("--config-file", dest="configFile", required=True,
                         help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument("--language-pair", dest="langPair", required=True,
                         help="The two languages we're interested in, separated by a comma")
    oparser.add_argument("--save-dir", dest="saveDir", default=".",
                         help="Directory that model WIP is saved to. If an existing model exists, load it")
    oparser.add_argument("--save-plots", dest="saveDirPlots", default="plots",
                         help="Directory that plots are saved to")
    oparser.add_argument("--delete-duplicate-transitions", dest="deleteDuplicateTransitions", default=False,
                         help="If True then only unique transitions are used in each batch")
    oparser.add_argument("--n-hosts-train", dest="n_train", type=int, default=1,
                         help="Number of hosts to train on")
    oparser.add_argument("--m-hosts-test", dest="m_test", type=int, default=1,
                         help="Number of hosts to test on")
    options = oparser.parse_args()

    np.random.seed(99)
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, linewidth=666)

    sqlconn = MySQL(options.configFile)
    languages = Languages(sqlconn.mycursor)

    hostNames_train = ["http://vade-retro.fr/"]
    hostNames_test = ["http://vade-retro.fr/"]
    #hostNames_train = ["http://www.buchmann.ch/"]
    #hostNames_test = ["http://www.visitbritain.com/"]
    #hostNames_train = ["http://carta.ro/", "http://www.bachelorstudies.fr/", "http://www.buchmann.ch/",
    #                   "http://chopescollection.be/", "http://www.visitbritain.com/",
    #                   "http://www.burnfateasy.info/"]  # allhostNames[0:options.n_train]
    #hostNames_test = ["http://www.lavery.ca/"]  # allhostNames[options.n_train:options.n_train + options.m_test]

    if not os.path.exists(options.saveDirPlots):
        os.mkdir(options.saveDirPlots)

    print("Training hosts are:")
    for h in hostNames_train:
        print(h)
    print()

    print("Testing hosts are:")
    for h in hostNames_test:
        print(h)
    print()

    with open('{}/hosts.info'.format(options.saveDirPlots), 'w') as f:
        f.write('Training hosts are:\n')
        for h in hostNames_train:
            f.write(h + '\n')
        f.write('\nTesting hosts are:\n')
        for h in hostNames_test:
            f.write(h + '\n')

    params = LearningParams(languages, options.saveDir, options.saveDirPlots,
                            options.deleteDuplicateTransitions, options.langPair)

    env_train_dic = {hostName: Env(sqlconn, hostName) for hostName in hostNames_train}
    env_test_dic = {hostName: Env(sqlconn, hostName) for hostName in hostNames_test}

    # normalise every environment to the shared language-ID space
    for dic in [env_train_dic, env_test_dic]:
        for hostName, env in dic.items():
            env.maxLangId = MAX_LANG_ID  # MAX_LANG_ID: module-level constant defined elsewhere
            env.nodes[sys.maxsize].lang = languages.GetLang("None")
            dic[hostName] = env

    tf.reset_default_graph()
    qns = Qnets(params, MAX_LANG_ID)
    #qns_test = Qnets(params, env_test)
    init = tf.global_variables_initializer()

    saver = None  #tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)
        totRewards, totDiscountedRewards = Train(params, sess, saver, env_train_dic, qns, env_test_dic)

def main():
    global TIMER
    TIMER = Timer()

    oparser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    oparser.add_argument("--config-file", dest="configFile", required=True,
                         help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument("--language-pair", dest="langPair", required=True,
                         help="The two languages we're interested in, separated by a comma")
    oparser.add_argument("--save-dir", dest="saveDir", default=".",
                         help="Directory that model WIP is saved to. If an existing model exists, load it")
    oparser.add_argument("--save-plots", dest="saveDirPlots", default="plot",
                         help="Directory that plots are saved to")
    oparser.add_argument("--delete-duplicate-transitions", dest="deleteDuplicateTransitions", default=False,
                         help="If True then only unique transitions are used in each batch")
    options = oparser.parse_args()

    np.random.seed()
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, linewidth=666)

    sqlconn = MySQL(options.configFile)
    languages = Languages(sqlconn.mycursor)

    params = LearningParams(languages, options.saveDir, options.saveDirPlots,
                            options.deleteDuplicateTransitions, options.langPair)

    # exactly one hostName assignment must be active for Env() below
    hostName = "http://www.buchmann.ch/"
    #hostName = "http://www.visitbritain.com/"
    #hostName = "http://vade-retro.fr/"  # smallest domain, for debugging

    hostName_test = "http://www.visitbritain.com/"
    #hostName_test = "http://www.buchmann.ch/"
    #hostName_test = "http://vade-retro.fr/"  # smallest domain, for debugging

    env = Env(sqlconn, hostName)
    env_test = Env(sqlconn, hostName_test)

    # change language of start node. 0 = stop
    env.nodes[sys.maxsize].lang = languages.GetLang("None")
    env_test.nodes[sys.maxsize].lang = languages.GetLang("None")
    #for node in env.nodes.values():
    #    print(node.Debug())

    # both environments must share the same language-ID space
    max_env_maxLangId = max([env.maxLangId, env_test.maxLangId])
    env.maxLangId = env_test.maxLangId = max_env_maxLangId

    tf.reset_default_graph()
    qns = Qnets(params, max_env_maxLangId)
    init = tf.global_variables_initializer()

    saver = None  #tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)

        totRewards, totDiscountedRewards = Train(params, sess, saver, env, qns, env_test)

        #params.debug = True
        arrDumb = dumb(env, len(env.nodes), params)
        arrRandom = randomCrawl(env, len(env.nodes), params)
        arrBalanced = balanced(env, len(env.nodes), params)
        arrRL = Walk(env, params, sess, qns)
        #print("arrDumb", arrDumb)
        #print("arrBalanced", arrBalanced)

        plt.plot(arrDumb, label="dumb")
        plt.plot(arrRandom, label="random")
        plt.plot(arrBalanced, label="balanced")
        plt.plot(arrRL, label="RL")
        plt.legend(loc='upper left')
        plt.xlabel('#crawled')
        plt.ylabel('#found')
        plt.show()

def main():
    global TIMER
    TIMER = Timer()

    oparser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    oparser.add_argument("--config-file", dest="configFile", required=True,
                         help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument("--language-pair", dest="langPair", required=True,
                         help="The two languages we're interested in, separated by a comma")
    oparser.add_argument("--save-dir", dest="saveDir", default=".",
                         help="Directory that model WIP is saved to. If an existing model exists, load it")
    oparser.add_argument("--save-plots", dest="saveDirPlots", default="",
                         help="Directory that plots are saved to")
    oparser.add_argument("--delete-duplicate-transitions", dest="deleteDuplicateTransitions", default=False,
                         help="If True then only unique transitions are used in each batch")
    oparser.add_argument("--n-hosts-train", dest="n_train", type=int, default=1,
                         help="Number of hosts to train on")
    oparser.add_argument("--m-hosts-test", dest="m_test", type=int, default=1,
                         help="Number of hosts to test on")
    options = oparser.parse_args()

    np.random.seed(99)
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, linewidth=666)

    sqlconn = MySQL(options.configFile)
    languages = Languages(sqlconn.mycursor)

    allhostNames = [
        "http://www.buchmann.ch/", "http://vade-retro.fr/", "http://www.visitbritain.com/",
        "http://www.lespressesdureel.com/", "http://www.otc-cta.gc.ca/", "http://tagar.es/",
        "http://lacor.es/", "http://telasmos.org/", "http://www.haitilibre.com/",
        "http://legisquebec.gouv.qc.ca", "http://hobby-france.com/", "http://www.al-fann.net/",
        "http://www.antique-prints.de/", "http://www.gamersyde.com/", "http://inter-pix.com/",
        "http://www.acklandsgrainger.com/", "http://www.predialparque.pt/", "http://carta.ro/",
        "http://www.restopages.be/", "http://www.burnfateasy.info/", "http://www.bedandbreakfast.eu/",
        "http://ghc.freeguppy.org/", "http://www.bachelorstudies.fr/", "http://chopescollection.be/",
        "http://www.lavery.ca/", "http://www.thecanadianencyclopedia.ca/", "http://www.vistastamps.com/",
        "http://www.linker-kassel.com/", "http://www.enterprise.fr/"
    ]  # "http://who.int/",
    shuffle(allhostNames)
    assert len(allhostNames) >= options.n_train + options.m_test

    # previously tried: vade-retro.fr, buchmann.ch, visitbritain.com, carta.ro,
    # bachelorstudies.fr, chopescollection.be, burnfateasy.info, lavery.ca
    hostNames_train = ["http://www.haitilibre.com/"]  # allhostNames[0:options.n_train]
    hostNames_test = ["http://www.haitilibre.com/"]  # allhostNames[options.n_train:options.n_train + options.m_test]

    if options.saveDirPlots:
        save_plots = 'plot'
        if not os.path.exists(save_plots):
            os.mkdir(save_plots)
    else:
        # number the runs within a train{n}test{m} parent directory
        par_d = 'train{}test{}'.format(options.n_train, options.m_test)
        if not os.path.exists(par_d):
            os.mkdir(par_d)
        new_run = max([int(run.replace('run', '')) for run in os.listdir(par_d)] + [0]) + 1
        save_plots = '{}/run{}'.format(par_d, new_run)
        os.mkdir(save_plots)

    os.mkdir('{}/{}'.format(save_plots, 'train'))
    os.mkdir('{}/{}'.format(save_plots, 'test'))
    for hostName in hostNames_train:
        os.mkdir('{}/{}/{}'.format(save_plots, 'train', extract(hostName).domain))
    for hostName in hostNames_test:
        os.mkdir('{}/{}/{}'.format(save_plots, 'test', extract(hostName).domain))

    print("Training hosts are:")
    for h in hostNames_train:
        print(h)
    print()

    print("Testing hosts are:")
    for h in hostNames_test:
        print(h)
    print()

    with open('{}/hosts.info'.format(save_plots), 'w') as f:
        f.write('Training hosts are:\n')
        for h in hostNames_train:
            f.write(h + '\n')
        f.write('\nTesting hosts are:\n')
        for h in hostNames_test:
            f.write(h + '\n')

    params = LearningParams(languages, options.saveDir, save_plots,
                            options.deleteDuplicateTransitions, options.langPair)

    env_train_dic = {}
    for hostName in hostNames_train:
        dom = extract(hostName).domain
        # path assumed to match what PickleDomain() writes
        with open('pickled_domains/' + dom, 'rb') as f:
            env_train_dic[hostName] = pickle.load(f)

    env_test_dic = {}
    for hostName in hostNames_test:
        dom = extract(hostName).domain
        with open('pickled_domains/' + dom, 'rb') as f:
            env_test_dic[hostName] = pickle.load(f)
    #env_test_dic = {hostName: Env(sqlconn, hostName) for hostName in hostNames_test}

    print(env_train_dic)

    #hostName = "http://www.buchmann.ch/"
    #hostName_test = "http://www.visitbritain.com/"
    #env = Env(sqlconn, hostName)
    #env_test = Env(sqlconn, hostName_test)

    # change language of start node. 0 = stop
    #env.nodes[sys.maxsize].lang = languages.GetLang("None")
    #env_test.nodes[sys.maxsize].lang = languages.GetLang("None")

    #for node in env.nodes.values():
    #    print(node.Debug())

    for dic in [env_train_dic, env_test_dic]:
        for hostName, env in dic.items():
            env.maxLangId = MAX_LANG_ID  # MAX_LANG_ID: module-level constant defined elsewhere
            env.nodes[sys.maxsize].lang = languages.GetLang("None")
            dic[hostName] = env

    tf.reset_default_graph()
    qns = Qnets(params, MAX_LANG_ID)
    #qns_test = Qnets(params, env_test)
    init = tf.global_variables_initializer()

    saver = None  #tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)
        totRewards, totDiscountedRewards = Train(params, sess, saver, env_train_dic, qns, env_test_dic)

def main():
    global TIMER
    TIMER = Timer()

    oparser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    oparser.add_argument("--config-file", dest="configFile", required=True,
                         help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument("--language-pair", dest="langPair", required=True,
                         help="The two languages we're interested in, separated by a comma")
    oparser.add_argument("--save-dir", dest="saveDir", default=".",
                         help="Directory that model WIP is saved to. If an existing model exists, load it")
    oparser.add_argument("--save-plots", dest="saveDirPlots", default="plot",
                         help="Directory that plots are saved to")
    oparser.add_argument("--delete-duplicate-transitions", dest="deleteDuplicateTransitions", default=False,
                         help="If True then only unique transitions are used in each batch")
    options = oparser.parse_args()

    np.random.seed()
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, linewidth=666)

    sqlconn = MySQL(options.configFile)
    languages = Languages(sqlconn.mycursor)

    params = LearningParams(languages, options.saveDir, options.saveDirPlots,
                            options.deleteDuplicateTransitions, options.langPair,
                            languages.maxLangId, languages.GetLang("None"))

    if not os.path.exists(options.saveDirPlots):
        os.mkdir(options.saveDirPlots)

    #hostName = "http://vade-retro.fr/"
    #hostName = "http://www.visitbritain.com/"
    hosts = ["http://www.buchmann.ch/"]  #, "http://telasmos.org/", "http://tagar.es/"

    #hostNameTest = "http://vade-retro.fr/"
    #hostNameTest = "http://www.buchmann.ch/"
    hostsTest = ["http://www.visitbritain.com/", "http://chopescollection.be/",
                 "http://www.bedandbreakfast.eu/"]

    envs = GetEnvs(sqlconn, languages, hosts)
    envsTest = GetEnvs(sqlconn, languages, hostsTest)

    tf.reset_default_graph()
    qns = Qnets(params)
    init = tf.global_variables_initializer()

    saver = None  #tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)
        totRewards, totDiscountedRewards = Train(params, sess, saver, qns, envs, envsTest)

def Main():
    print("Starting")

    global TIMER
    TIMER = Timer()

    oparser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    oparser.add_argument("--config-file", dest="configFile", required=True,
                         help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument("--save-dir", dest="saveDir", default=".",
                         help="Directory that model WIP is saved to. If an existing model exists, load it")
    oparser.add_argument("--delete-duplicate-transitions", dest="deleteDuplicateTransitions", default=False,
                         help="If True then only unique transitions are used in each batch")
    oparser.add_argument("--language-pair", dest="langPair", required=True,
                         help="The two languages we're interested in, separated by a comma")
    options = oparser.parse_args()

    np.random.seed()
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, linewidth=666)

    sqlconn = MySQL(options.configFile)

    hostName = "http://vade-retro.fr/"
    #hostName = "http://www.buchmann.ch/"
    #hostName = "http://www.visitbritain.com/"

    #pickleName = hostName + ".pickle"
    env = Env(sqlconn, hostName)
    # if os.path.exists(pickleName):
    #     with open(pickleName, 'rb') as f:
    #         print("unpickling")
    #         env = pickle.load(f)
    # else:
    #     env = Env(sqlconn, hostName)
    #     with open(pickleName, 'wb') as f:
    #         print("pickling")
    #         pickle.dump(env, f)

    languages = Languages(sqlconn.mycursor)
    params = LearningParams(languages, options.saveDir,
                            options.deleteDuplicateTransitions, options.langPair)

    tf.reset_default_graph()
    qns = Qnets(params, env)
    init = tf.global_variables_initializer()

    saver = tf.train.Saver()
    with tf.Session() as sess:
    #with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        sess.run(init)

        qns.q[0].PrintAllQ(params, env, sess)
        #WalkAll(env, params, sess, qns.q[0])
        print()

        TIMER.Start("Train")
        totRewards, totDiscountedRewards = Train(params, sess, saver, env, qns)
        TIMER.Pause("Train")

        #qn.PrintAllQ(params, env, sess)
        #env.WalkAll(params, sess, qn)

        Walk(env, sys.maxsize, params, sess, qns.q[0], True)

    del TIMER

    plt.plot(totRewards)
    plt.plot(totDiscountedRewards)
    plt.show()

    plt.plot(qns.q[0].corpus.losses)
    plt.plot(qns.q[1].corpus.losses)
    plt.show()

    plt.plot(qns.q[0].corpus.sumWeights)
    plt.plot(qns.q[1].corpus.sumWeights)
    plt.show()

    print("Finished")
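
# Assumed entry point (not shown in the original excerpt) so the last script
# can be run directly:
if __name__ == "__main__":
    Main()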