Exemplo n.º 1
0
def main():
    """Build the link graph for one hard-coded host and crawl it, timing both phases."""
    parser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    parser.add_argument("--config-file", dest="configFile", required=True,
                        help="Path to config file (containing MySQL login etc.)")
    args = parser.parse_args()

    # Seed from OS entropy; print float arrays compactly on wide lines.
    np.random.seed()
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)},
                        linewidth=666)

    sqlconn = MySQL(args.configFile)

    hostName = "http://vade-retro.fr/"
    # hostName = "http://www.buchmann.ch/"

    buildStart = time.time()
    env = Env(sqlconn, hostName)
    buildEnd = time.time()
    print('Time to build the graph took: ', buildEnd - buildStart, ' seconds.')
    # crawl_method1(sqlconn, env)

    crawlStart = time.time()
    crawl_method2(sqlconn, env)
    crawlEnd = time.time()
    print('Time to crawl the graph took: ', crawlEnd - crawlStart, ' seconds.')
Exemplo n.º 2
0
def PickleDomain(url):
    """Build the crawl Env for *url* and cache it on disk under pickled_domains/.

    The cache file is named after the registered domain of *url*; if it
    already exists, the expensive MySQL graph build is skipped entirely.
    """
    print("Pickling", url)
    domain = extract(url).domain
    filePath = 'pickled_domains/' + domain

    if not os.path.exists(filePath):
        # Ensure the cache directory exists before opening the file for
        # writing — otherwise open(..., 'wb') raises FileNotFoundError on a
        # fresh checkout.
        os.makedirs('pickled_domains', exist_ok=True)

        sqlconn = MySQL('config.ini')

        env = Env(sqlconn, url)

        with open(filePath, 'wb') as f:
            pickle.dump(env, f)

    print("Done {}".format(domain))
Exemplo n.º 3
0
def main():
    """Compare a naive crawl against a language-balanced crawl on one host and plot both curves."""
    global DEBUG
    oparser = argparse.ArgumentParser(
        description="intelligent crawling with q-learning")
    oparser.add_argument(
        "--config-file",
        dest="configFile",
        required=True,
        help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument(
        "--language-pair",
        dest="langPair",
        required=True,
        help="The 2 language we're interested in, separated by ,")
    options = oparser.parse_args()

    np.random.seed()
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)},
                        linewidth=666)

    sqlconn = MySQL(options.configFile)

    languages = Languages(sqlconn.mycursor)
    langPairList = options.langPair.split(",")
    # Validate explicitly rather than with `assert`: asserts are stripped
    # under `python -O`, which would let a malformed --language-pair fall
    # through to an IndexError below.
    if len(langPairList) != 2:
        oparser.error("--language-pair must name exactly 2 languages separated by ,")
    langIds = [
        languages.GetLang(langPairList[0]),
        languages.GetLang(langPairList[1])
    ]

    #hostName = "http://vade-retro.fr/"
    #hostName = "http://www.buchmann.ch/"
    hostName = "http://www.visitbritain.com/"
    env = Env(sqlconn, hostName)

    #DEBUG = True
    arrNaive = naive(sqlconn, env, len(env.nodes))
    arrBalanced = balanced(sqlconn, env, len(env.nodes), langIds)
    #print("arrNaive", arrNaive)
    #print("arrBalanced", arrBalanced)
    plt.plot(arrNaive, label="naive")
    plt.plot(arrBalanced, label="balanced")
    plt.legend(loc='upper left')
    plt.show()
Exemplo n.º 4
0
def GetEnv(configFile, languages, url):
    """Return a crawl Env for *url*, preferring a cached pickle when one exists.

    Falls back to building the graph from MySQL when no pickle is cached.
    The start node's language is reset to "None" before the Env is returned.
    """
    cachePath = 'pickled_domains/' + extract(url).domain

    if os.path.exists(cachePath):
        print("unpickle", url)
        with open(cachePath, 'rb') as f:
            env = pickle.load(f)
    else:
        print("mysql load", url)
        sqlconn = MySQL(configFile)
        env = Env(sqlconn, url)

    # change language of start node. 0 = stop
    env.nodes[sys.maxsize].lang = languages.GetLang("None")
    print("   ", len(env.nodes), "nodes,", env.numAligned, "aligned docs")
    #for node in env.nodes.values():
    #    print(node.Debug())

    print("env created", url)
    return env
Exemplo n.º 5
0
def main():
    """Train a q-learning crawler on a set of training hosts and evaluate it on test hosts.

    Builds one Env per host from MySQL, normalises every env's language ids to
    MAX_LANG_ID and resets each start node's language, then trains the Qnets
    inside a single TensorFlow session.
    """
    global TIMER
    TIMER = Timer()

    oparser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    oparser.add_argument("--config-file", dest="configFile", required=True,
                         help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument("--language-pair", dest="langPair", required=True,
                         help="The 2 language we're interested in, separated by ,")
    oparser.add_argument("--save-dir", dest="saveDir", default=".",
                         help="Directory that model WIP are saved to. If existing model exists then load it")
    oparser.add_argument("--save-plots", dest="saveDirPlots", default="plots",
                         help="Directory ")
    oparser.add_argument("--delete-duplicate-transitions", dest="deleteDuplicateTransitions",
                         default=False, help="If True then only unique transition are used in each batch")
    oparser.add_argument("--n-hosts-train", dest="n_train", type=int,
                         default=1, help="If True then only unique transition are used in each batch")
    oparser.add_argument("--m-hosts-test", dest="m_test", type=int,
                         default=1, help="If True then only unique transition are used in each batch")
    options = oparser.parse_args()

    # Fixed seed so training runs are reproducible.
    np.random.seed(99)
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, linewidth=666)

    sqlconn = MySQL(options.configFile)

    languages = Languages(sqlconn.mycursor)

    #["http://vade-retro.fr/",] #
    hostNames_train = ["http://vade-retro.fr/"]
    hostNames_test = ["http://vade-retro.fr/"]
    #hostNames_train = ["http://www.buchmann.ch/"]
    #hostNames_test = ["http://www.visitbritain.com/"]
    #hostNames_train = ["http://carta.ro/","http://www.bachelorstudies.fr/", "http://www.buchmann.ch/", "http://chopescollection.be/", "http://www.visitbritain.com/", "http://www.burnfateasy.info/"] #allhostNames[0:options.n_train]
    #hostNames_test = ["http://www.lavery.ca/",] #allhostNames[options.n_train:options.n_train+options.m_test]

    if not os.path.exists(options.saveDirPlots):
        os.mkdir(options.saveDirPlots)

    print("Training hosts are:")
    for h in hostNames_train:
        print(h)
    print()
    print("Testing hosts are:")
    for h in hostNames_test:
        print(h)
    print()

    # Record the train/test split next to the plots for later inspection.
    with open('{}/hosts.info'.format(options.saveDirPlots), 'w') as f:
        f.write('Training hosts are:\n')
        for h in hostNames_train:
            f.write(h+'\n')
        f.write('\nTesting hosts are:\n')
        for h in hostNames_test:
            f.write(h+'\n')

    params = LearningParams(languages, options.saveDir, options.saveDirPlots, options.deleteDuplicateTransitions, options.langPair)

    env_train_dic = {hostName:Env(sqlconn, hostName) for hostName in hostNames_train}
    env_test_dic = {hostName:Env(sqlconn, hostName) for hostName in hostNames_test}

    for dic in [env_train_dic, env_test_dic]:
        # BUG FIX: iterate the dict currently being processed. The original
        # iterated env_test_dic on both passes, so the training envs never had
        # maxLangId set or their start-node language reset.
        for hostName, env in dic.items():
            env.maxLangId = MAX_LANG_ID
            # change language of start node. 0 = stop
            env.nodes[sys.maxsize].lang = languages.GetLang("None")
            dic[hostName] = env

    tf.reset_default_graph()
    qns = Qnets(params, MAX_LANG_ID)
    #qns_test = Qnets(params, env_test)
    init = tf.global_variables_initializer()

    saver = None #tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)

        totRewards, totDiscountedRewards = Train(params, sess, saver, env_train_dic, qns, env_test_dic)
Exemplo n.º 6
0
def main():
    """Train a q-learning crawler on one host, validate on a second host, then
    plot the learnt policy against dumb/random/balanced baseline crawls."""
    global TIMER
    TIMER = Timer()

    # Build the CLI from a table of (flag, kwargs) specs.
    argSpecs = [
        ("--config-file", dict(dest="configFile", required=True,
                               help="Path to config file (containing MySQL login etc.)")),
        ("--language-pair", dict(dest="langPair", required=True,
                                 help="The 2 language we're interested in, separated by ,")),
        ("--save-dir", dict(dest="saveDir", default=".",
                            help="Directory that model WIP are saved to. If existing model exists then load it")),
        ("--save-plots", dict(dest="saveDirPlots", default="plot",
                              help="Directory ")),
        ("--delete-duplicate-transitions", dict(dest="deleteDuplicateTransitions", default=False,
                                                help="If True then only unique transition are used in each batch")),
    ]
    oparser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    for flag, kwargs in argSpecs:
        oparser.add_argument(flag, **kwargs)
    options = oparser.parse_args()

    np.random.seed()
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, linewidth=666)

    sqlconn = MySQL(options.configFile)

    languages = Languages(sqlconn.mycursor)
    params = LearningParams(languages, options.saveDir, options.saveDirPlots, options.deleteDuplicateTransitions, options.langPair)

    #hostName = "http://www.visitbritain.com/"
    #
    hostName = "http://www.buchmann.ch/"
    #hostName = "http://vade-retro.fr/"    # smallest domain for debugging

    hostName_test = "http://www.visitbritain.com/"
    #hostName_test = "http://www.buchmann.ch/"
    #hostName_test = "http://vade-retro.fr/"    # smallest domain for debugging

    env = Env(sqlconn, hostName)
    env_test = Env(sqlconn, hostName_test)

    # Both envs must agree on the language-id space for one shared network,
    # and each start node's language is reset. 0 = stop.
    sharedMaxLangId = max(env.maxLangId, env_test.maxLangId)
    for e in (env, env_test):
        e.nodes[sys.maxsize].lang = languages.GetLang("None")
        e.maxLangId = sharedMaxLangId
    #for node in env.nodes.values():
    #    print(node.Debug())

    tf.reset_default_graph()
    qns = Qnets(params, sharedMaxLangId)
    init = tf.global_variables_initializer()

    saver = None #tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)

        totRewards, totDiscountedRewards = Train(params, sess, saver, env, qns, env_test)

        #params.debug = True
        crawlBudget = len(env.nodes)
        curves = [
            ("dumb", dumb(env, crawlBudget, params)),
            ("random", randomCrawl(env, crawlBudget, params)),
            ("balanced", balanced(env, crawlBudget, params)),
            ("RL", Walk(env, params, sess, qns)),
        ]
        #print("arrDumb", arrDumb)
        #print("arrBalanced", arrBalanced)

        for label, arr in curves:
            plt.plot(arr, label=label)
        plt.legend(loc='upper left')
        plt.xlabel('#crawled')
        plt.ylabel('#found')
        plt.show()
Exemplo n.º 7
0
def main():
    """Train a q-learning crawler on pickled host graphs and evaluate on test hosts.

    Loads one pre-pickled Env per train/test host, normalises their language
    ids, prepares a per-run plot directory, and trains the Qnets in a single
    TensorFlow session.
    """
    global TIMER
    TIMER = Timer()

    oparser = argparse.ArgumentParser(
        description="intelligent crawling with q-learning")
    oparser.add_argument(
        "--config-file",
        dest="configFile",
        required=True,
        help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument(
        "--language-pair",
        dest="langPair",
        required=True,
        help="The 2 language we're interested in, separated by ,")
    oparser.add_argument(
        "--save-dir",
        dest="saveDir",
        default=".",
        help=
        "Directory that model WIP are saved to. If existing model exists then load it"
    )
    oparser.add_argument("--save-plots",
                         dest="saveDirPlots",
                         default="",
                         help="Directory ")
    oparser.add_argument(
        "--delete-duplicate-transitions",
        dest="deleteDuplicateTransitions",
        default=False,
        help="If True then only unique transition are used in each batch")
    oparser.add_argument(
        "--n-hosts-train",
        dest="n_train",
        type=int,
        default=1,
        help="If True then only unique transition are used in each batch")
    oparser.add_argument(
        "--m-hosts-test",
        dest="m_test",
        type=int,
        default=1,
        help="If True then only unique transition are used in each batch")
    options = oparser.parse_args()

    # Fixed seed so the host shuffle and training are reproducible.
    np.random.seed(99)
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)},
                        linewidth=666)

    sqlconn = MySQL(options.configFile)

    languages = Languages(sqlconn.mycursor)

    #allhostNames = ["http://vade-retro.fr/", "http://vade-retro.fr/"]
    #hostName = "http://vade-retro.fr/"
    allhostNames = [
        "http://www.buchmann.ch/", "http://vade-retro.fr/",
        "http://www.visitbritain.com/", "http://www.lespressesdureel.com/",
        "http://www.otc-cta.gc.ca/", "http://tagar.es/", "http://lacor.es/",
        "http://telasmos.org/", "http://www.haitilibre.com/",
        "http://legisquebec.gouv.qc.ca", "http://hobby-france.com/",
        "http://www.al-fann.net/", "http://www.antique-prints.de/",
        "http://www.gamersyde.com/", "http://inter-pix.com/",
        "http://www.acklandsgrainger.com/", "http://www.predialparque.pt/",
        "http://carta.ro/", "http://www.restopages.be/",
        "http://www.burnfateasy.info/", "http://www.bedandbreakfast.eu/",
        "http://ghc.freeguppy.org/", "http://www.bachelorstudies.fr/",
        "http://chopescollection.be/", "http://www.lavery.ca/",
        "http://www.thecanadianencyclopedia.ca/",
        "http://www.vistastamps.com/", "http://www.linker-kassel.com/",
        "http://www.enterprise.fr/"
    ]

    #                    "http://who.int/",
    shuffle(allhostNames)

    assert len(allhostNames) >= options.n_train + options.m_test
    #["http://vade-retro.fr/",] #["http://www.buchmann.ch/",] #
    hostNames_train = [
        "http://www.haitilibre.com/"
    ]  # "http://www.visitbritain.com/"]#["http://vade-retro.fr/",] # ["http://www.buchmann.ch/",]#"http://carta.ro/","http://www.bachelorstudies.fr/",  "http://chopescollection.be/", "http://www.visitbritain.com/", "http://www.burnfateasy.info/"] #allhostNames[0:options.n_train]
    hostNames_test = [
        "http://www.haitilibre.com/"
    ]  #["http://www.lavery.ca/",] #allhostNames[options.n_train:options.n_train+options.m_test]

    if options.saveDirPlots:
        # BUG FIX: honour the user-supplied plot directory. Previously this
        # branch hard-coded 'plot', silently ignoring the --save-plots value.
        save_plots = options.saveDirPlots

        if not os.path.exists(save_plots): os.mkdir(save_plots)
    else:
        # No directory given: auto-number a fresh runN directory under a
        # trainXtestY parent, with per-host train/test subdirectories.
        par_d = 'train{}test{}'.format(options.n_train, options.m_test)

        if not os.path.exists(par_d): os.mkdir(par_d)

        new_run = max(
            [int(run.replace('run', ''))
             for run in os.listdir(par_d)] + [0]) + 1

        save_plots = '{}/run{}'.format(par_d, new_run)

        os.mkdir(save_plots)
        os.mkdir('{}/{}'.format(save_plots, 'train'))
        os.mkdir('{}/{}'.format(save_plots, 'test'))

        for hostName in hostNames_train:
            os.mkdir('{}/{}/{}'.format(save_plots, 'train',
                                       extract(hostName).domain))
        for hostName in hostNames_test:
            os.mkdir('{}/{}/{}'.format(save_plots, 'test',
                                       extract(hostName).domain))

    print("Training hosts are:")
    for h in hostNames_train:
        print(h)
    print()
    print("Testing hosts are:")
    for h in hostNames_test:
        print(h)
    print()

    # Record the train/test split next to the plots for later inspection.
    with open('{}/hosts.info'.format(save_plots), 'w') as f:
        f.write('Training hosts are:\n')
        for h in hostNames_train:
            f.write(h + '\n')
        f.write('\nTesting hosts are:\n')
        for h in hostNames_test:
            f.write(h + '\n')

    params = LearningParams(languages, options.saveDir, save_plots,
                            options.deleteDuplicateTransitions,
                            options.langPair)

    # NOTE(review): these pickles are read from the bare domain name in the
    # current directory, whereas GetEnv/PickleDomain use 'pickled_domains/' +
    # domain — confirm which location the pickles actually live in.
    env_train_dic = {}
    for hostName in hostNames_train:
        dom = extract(hostName).domain
        with open(dom, 'rb') as f:
            env_train_dic[hostName] = pickle.load(f)

    env_test_dic = {}
    for hostName in hostNames_test:
        dom = extract(hostName).domain
        with open(dom, 'rb') as f:
            env_test_dic[hostName] = pickle.load(f)

    #env_test_dic = {hostName:Env(sqlconn, hostName) for hostName in hostNames_test}

    print(env_train_dic)

    # Normalise every env: shared language-id space, start-node language
    # reset. 0 = stop.
    for dic in [env_train_dic, env_test_dic]:
        for hostName, env in dic.items():
            env.maxLangId = MAX_LANG_ID
            env.nodes[sys.maxsize].lang = languages.GetLang("None")
            dic[hostName] = env

    tf.reset_default_graph()
    qns = Qnets(params, MAX_LANG_ID)
    #qns_test = Qnets(params, env_test)
    init = tf.global_variables_initializer()

    saver = None  #tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)

        totRewards, totDiscountedRewards = Train(params, sess, saver,
                                                 env_train_dic, qns,
                                                 env_test_dic)
Exemplo n.º 8
0
def main():
    """Train q-learning crawlers on a set of hosts and evaluate on held-out hosts."""
    global TIMER
    TIMER = Timer()

    oparser = argparse.ArgumentParser(description="intelligent crawling with q-learning")
    oparser.add_argument("--config-file", dest="configFile", required=True,
                         help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument("--language-pair", dest="langPair", required=True,
                         help="The 2 language we're interested in, separated by ,")
    oparser.add_argument("--save-dir", dest="saveDir", default=".",
                         help="Directory that model WIP are saved to. If existing model exists then load it")
    oparser.add_argument("--save-plots", dest="saveDirPlots", default="plot",
                         help="Directory ")
    oparser.add_argument("--delete-duplicate-transitions", dest="deleteDuplicateTransitions",
                         default=False,
                         help="If True then only unique transition are used in each batch")
    options = oparser.parse_args()

    np.random.seed()
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)}, linewidth=666)

    sqlconn = MySQL(options.configFile)

    languages = Languages(sqlconn.mycursor)
    params = LearningParams(languages, options.saveDir, options.saveDirPlots,
                            options.deleteDuplicateTransitions, options.langPair,
                            languages.maxLangId, languages.GetLang("None"))

    if not os.path.exists(options.saveDirPlots):
        os.mkdir(options.saveDirPlots)

    #hostName = "http://vade-retro.fr/"
    hosts = ["http://www.buchmann.ch/"]  #, "http://telasmos.org/", "http://tagar.es/"]
    #hostName = "http://www.visitbritain.com/"

    #hostNameTest = "http://vade-retro.fr/"
    #hostNameTest = "http://www.buchmann.ch/"
    hostsTest = ["http://www.visitbritain.com/", "http://chopescollection.be/",
                 "http://www.bedandbreakfast.eu/"]

    envs = GetEnvs(sqlconn, languages, hosts)
    envsTest = GetEnvs(sqlconn, languages, hostsTest)

    tf.reset_default_graph()
    qns = Qnets(params)
    init = tf.global_variables_initializer()

    saver = None  #tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init)
        totRewards, totDiscountedRewards = Train(params, sess, saver, qns,
                                                 envs, envsTest)
Exemplo n.º 9
0
def Main():
    """Train the q-learning crawler on one hard-coded host, then walk the graph
    with the learnt policy and plot rewards, losses and network weights."""
    print("Starting")
    global TIMER
    TIMER = Timer()

    oparser = argparse.ArgumentParser(
        description="intelligent crawling with q-learning")
    oparser.add_argument(
        "--config-file",
        dest="configFile",
        required=True,
        # FIX: restored the missing closing ')' in the help text, matching the
        # wording used by every sibling entry point.
        help="Path to config file (containing MySQL login etc.)")
    oparser.add_argument(
        "--save-dir",
        dest="saveDir",
        default=".",
        help=
        "Directory that model WIP are saved to. If existing model exists then load it"
    )
    oparser.add_argument(
        "--delete-duplicate-transitions",
        dest="deleteDuplicateTransitions",
        default=False,
        help="If True then only unique transition are used in each batch")
    oparser.add_argument(
        "--language-pair",
        dest="langPair",
        required=True,
        help="The 2 language we're interested in, separated by ,")
    options = oparser.parse_args()

    np.random.seed()
    np.set_printoptions(formatter={'float': lambda x: "{0:0.1f}".format(x)},
                        linewidth=666)

    sqlconn = MySQL(options.configFile)

    hostName = "http://vade-retro.fr/"
    #hostName = "http://www.buchmann.ch/"
    #hostName = "http://www.visitbritain.com/"
    #pickleName = hostName + ".pickle"

    env = Env(sqlconn, hostName)
    # if os.path.exists(pickleName):
    #     with open(pickleName, 'rb') as f:
    #         print("unpickling")
    #         env = pickle.load(f)
    # else:
    #     env = Env(sqlconn, hostName)
    #     with open(pickleName, 'wb') as f:
    #         print("pickling")
    #         pickle.dump(env,f)

    languages = Languages(sqlconn.mycursor)
    params = LearningParams(languages, options.saveDir,
                            options.deleteDuplicateTransitions,
                            options.langPair)

    tf.reset_default_graph()
    qns = Qnets(params, env)
    init = tf.global_variables_initializer()

    saver = tf.train.Saver()
    with tf.Session() as sess:
        #with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
        sess.run(init)

        qns.q[0].PrintAllQ(params, env, sess)
        #WalkAll(env, params, sess, qns.q[0])
        print()

        TIMER.Start("Train")
        totRewards, totDiscountedRewards = Train(params, sess, saver, env, qns)
        TIMER.Pause("Train")

        #qn.PrintAllQ(params, env, sess)
        #env.WalkAll(params, sess, qn)

        # Walk from the start node (sys.maxsize) with the trained policy.
        Walk(env, sys.maxsize, params, sess, qns.q[0], True)

        # Drop the global timer before plotting; presumably its destructor
        # prints the accumulated timings — TODO confirm in Timer.
        del TIMER

        plt.plot(totRewards)
        plt.plot(totDiscountedRewards)
        plt.show()

        plt.plot(qns.q[0].corpus.losses)
        plt.plot(qns.q[1].corpus.losses)
        plt.show()

        plt.plot(qns.q[0].corpus.sumWeights)
        plt.plot(qns.q[1].corpus.sumWeights)
        plt.show()

    print("Finished")