Example #1
    def crawl(self):
        self.get_pattern(self.dataset, self.cluster_rank)
        self.a = annotator(self.dataset)

        write_file = open(
            "./results/vidal_{0}_{1}_{2}_size{3}.txt".format(
                self.dataset, self.date, self.cluster_rank, self.crawl_size),
            "w")
        num_web_crawl = 0
        entry, prefix = self.entry, self.prefix
        self.url_stack = [(entry, "", 0)]
        self.final_list = []
        size, num = self.crawl_size, 0  # crawl budget and number of pages crawled so far
        s = sampler(self.dataset, self.entry, self.prefix, 0)
        while (num < size and len(self.url_stack) > 0):
            first_url = self.url_stack[0][0]
            parent_url = self.url_stack[0][1]
            rule_id = self.url_stack[0][2]
            try:
                print "first url is ", first_url
            except:
                traceback.print_exc()

            if first_url not in self.history_set:
                num += 1
                try:
                    url_list, new_rule_id = self.crawl_link(
                        first_url, rule_id, self.history_set, s)
                    self.final_list.append((first_url, parent_url, rule_id))
                except:
                    print "might miss somthing here"
                    traceback.print_exc()
                    flag = s.crawlUrl(first_url, self.dataset, self.url_stack,
                                      self.history_set)
                    if flag == 1:
                        url_list, new_rule_id = self.crawl_link(
                            first_url, rule_id, self.history_set, s)
                        self.final_list.append(
                            (first_url, parent_url, rule_id))
                        random_time_s = random.randint(5, 10)
                        time.sleep(random_time_s)
                        num_web_crawl += 1
                        if num_web_crawl % 10 == 9:
                            random_time_s = random.randint(60, 90)
                            time.sleep(random_time_s)
                    else:
                        num -= 1
                        print "crawl failure"
            if self.url_stack[0][0] == first_url:
                self.url_stack.pop(0)
            print " num is {}".format(num)
            sys.stdout.flush()
            self.history_set.add(first_url)

        print len(self.final_list), "length of final list"

        for pair in self.final_list:
            url, parent_url, cluster_id = pair[0], pair[1], pair[2]
            write_file.write(url + "\t" + str(parent_url) + "\t" +
                             str(cluster_id) + '\n')
Example #2
async def set_rule(sid, data):
    name, method, direction, criteria, exclude = (
        data["name"], data["method"], data["direction"],
        data["criteria"], data["exclude"])
    rules[name] = sampler(name, method, direction, criteria, exclude)
    await sio.emit('rules',
                   json.dumps(rules, cls=AdvancedJSONEncoder),
                   room=sid)
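set_rule relies on a socket.io server instance sio, a module-level rules dict, and an AdvancedJSONEncoder, none of which appear in the listing. A hedged guess at those pieces, for illustration only (the async_mode value and the encoder's fallback are assumptions, not the project's actual definitions):

import json
import socketio

sio = socketio.AsyncServer(async_mode='asgi')   # assumed server setup
rules = {}

class AdvancedJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        # assumed behaviour: serialize sampler objects by their attributes
        if hasattr(obj, '__dict__'):
            return obj.__dict__
        return super(AdvancedJSONEncoder, self).default(obj)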
Example #3
async def read_rule(sid):
    global rules
    try:
        # load the rule set from file
        with open("ruleset.json", "r") as f:
            rules_in_json = json.load(f)
        # build a sampler object for each rule
        rules = {
            key: sampler(data["name"], data["method"], data["direction"],
                         data["criteria"], [])
            for (key, data) in rules_in_json.items()
        }
        await sio.emit('rules',
                       json.dumps(rules, cls=AdvancedJSONEncoder),
                       room=sid)
    except Exception as e:
        await sio.emit('error', str(e), room=sid)
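read_rule rebuilds the rules dict from ruleset.json. Judging only by the keys it reads, the file presumably maps rule names to objects with name, method, direction and criteria fields (exclude is passed as an empty list, so it is presumably not stored). The sketch below is an illustrative guess at that shape; every value is hypothetical.

# Illustrative guess at the shape of ruleset.json (all values are made up):
EXAMPLE_RULESET = {
    "rule1": {
        "name": "rule1",
        "method": "threshold",    # hypothetical sampling method name
        "direction": "above",     # hypothetical direction value
        "criteria": 0.5           # hypothetical criterion
    }
}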
Example #4
def train_and_sample(gen,
                     dis,
                     dataset,
                     sample_dataset,
                     config,
                     stage=1,
                     pre_gen=None,
                     pri_img_every=100,
                     save_path='',
                     sample_every=10,
                     save_every=100):
    """
    This function is used to train the generator and discriminator of both stage 1 and 2.
    gen: Generator object.
    dis: Discriminator object.
    dataset: The dataloader which is used to obtain images for training.
    sample_dataset: The dataset from which pre trained sentence embeddings for sampling is obtained.
    config: Dictionary containing the hyperparameters.
    stage: Indicates which stage so that stage specific operations can be done.
    pre_gen: If stage 2 this gives the path to the trained stage 1 generator.
    pri_img_every: Tells the iteration frequency for which loss and fake images generated has to be displayed.
    save_path: Containes the path to which the models and the images has to be stored.
    sample_every: Indicates the epoch frequency for sampling images from the trained generator.
    save_every: Indicates the epoch frequency for which the model has to be saved.
    """

    if stage == 2 and pre_gen is None:
        return 'Give the path to the trained stage 1 generator'
    elif stage == 2 and pre_gen is not None:
        gen.gen_1.load_state_dict(torch.load(pre_gen)['generator'])

    if stage == 1:
        save_path = os.path.join(save_path, '1')
    else:
        save_path = os.path.join(save_path, '2')

    params = config
    noise_dim = params['noise_dim']
    batch_size = params['batch_size']
    gen_lr = params['gen_lr']
    dis_lr = params['dis_lr']

    noise = torch.FloatTensor(batch_size, noise_dim)
    noise = noise.to(device)

    imgen_noise = torch.FloatTensor(batch_size, noise_dim).normal_(0, 1)
    imgen_noise = imgen_noise.to(device)

    real_labels = torch.FloatTensor(batch_size).fill_(1)
    fake_labels = torch.FloatTensor(batch_size).fill_(0)
    real_labels = real_labels.to(device)
    fake_labels = fake_labels.to(device)

    optimizer_dis = optim.Adam(dis.parameters(), lr=dis_lr, betas=(0.5, 0.999))
    # omit the frozen stage 1 generator layers from the optimizer when training stage 2
    gen_layers = [layer for layer in gen.parameters() if layer.requires_grad]
    optimizer_gen = optim.Adam(gen_layers, lr=gen_lr, betas=(0.5, 0.999))

    for epoch in range(params['epoch']):
        er_d = []
        er_g = []
        kl = []
        start = time.time()
        print('Epoch {}'.format(epoch + 1))
        # decay the learning rate after every specified interval
        if epoch > 0 and (epoch + 1) % params['lr_decay_epoch'] == 0:
            gen_lr *= 0.5
            for par in optimizer_gen.param_groups:
                par['lr'] = gen_lr
            dis_lr *= 0.5
            for par in optimizer_dis.param_groups:
                par['lr'] = dis_lr

        for i, data in enumerate(dataset, 0):
            real_image, embedding = data
            real_image = real_image.to(device)
            embedding = embedding.to(device)

            noise.data.normal_(0, 1)
            gen.train()
            _, fake_image, mean, variance = gen(embedding, noise)  # generate a fake image

            dis.zero_grad()  #updating discriminator
            error_d, real_error, wrong_error, fake_error = discriminator_loss(
                dis, fake_image, real_image, fake_labels, real_labels, mean,
                stage)
            er_d.append(error_d.item())
            error_d.backward()
            optimizer_dis.step()

            gen.zero_grad()  #updating generator
            error_g = generator_loss(dis, fake_image, real_labels, mean)
            er_g.append(error_g.item())
            kl_los = kl_loss(mean, variance)
            kl.append(kl_los.item())
            total_error = error_g + kl_los * params['kl_coeff']
            total_error.backward()
            optimizer_gen.step()

            if (((i + 1) % pri_img_every) == 0):
                print('Discriminator_error: {}'.format(error_d.item()))
                print('Generator_error:{}'.format(error_g.item()))
                print('KL loss:{}'.format(kl_los.item()))

                print('Running discriminator loss: {}'.format(
                    sum(er_d) / len(er_d)))
                print('Running generator loss: {}'.format(
                    sum(er_g) / len(er_g)))
                print('Running KL loss: {}'.format(sum(kl) / len(kl)))

                previous, current, _, _ = gen(embedding, imgen_noise)
                save_image(real_image, current, epoch + 1,
                           os.path.join(save_path, 'images'))
                show = utils.make_grid(real_image[0:16])
                image_show(show)
                show = utils.make_grid(current[0:16])
                image_show(show)
                if previous is not None:
                    save_image(None, previous, epoch + 1,
                               os.path.join(save_path, 'images'))

        elapsed_time = time.time() - start
        print('Epoch {} completed in {:.0f}minutes {:.0f}seconds'.format(
            epoch + 1, elapsed_time // 60, elapsed_time % 60))
        print('Discriminator loss for this epoch: {}'.format(
            sum(er_d) / len(er_d)))
        print('Generator loss for this epoch: {}'.format(
            sum(er_g) / len(er_g)))
        print('KL loss for this epoch: {}'.format(sum(kl) / len(kl)))

        if ((epoch + 1) % save_every == 0):
            save_model(gen,
                       dis,
                       optimizer_gen,
                       optimizer_dis,
                       epoch + 1,
                       os.path.join(save_path, 'model'),
                       stage=stage)
        if ((epoch + 1) % sample_every == 0):
            sampler(gen,
                    sample_dataset,
                    epoch + 1,
                    noise=imgen_noise,
                    save_path=os.path.join(save_path, 'images'))

    save_model(gen,
               dis,
               optimizer_gen,
               optimizer_dis,
               params['epoch'],
               os.path.join(save_path, 'model'),
               stage=stage)
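A hedged usage sketch for train_and_sample: the config keys below are exactly the ones the function reads, but every concrete value is a placeholder, and gen, dis, dataloader and sample_dataset are assumed to be constructed elsewhere in the project.

config = {
    'noise_dim': 100,        # placeholder values, not the project's real settings
    'batch_size': 64,
    'gen_lr': 2e-4,
    'dis_lr': 2e-4,
    'epoch': 120,
    'lr_decay_epoch': 20,
    'kl_coeff': 2.0,
}

# gen, dis, dataloader and sample_dataset come from the rest of the project
train_and_sample(gen, dis, dataloader, sample_dataset, config,
                 stage=1, save_path='./checkpoints',
                 sample_every=10, save_every=50)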
Example #5
    def crawling(self, crawl_size=1000):
        if not os.path.exists("./results/irobot/"):
            os.mkdir("./results/irobot/")
        write_file = open(
            "./results/irobot/{0}_irobot_size{1}.txt".format(
                self.dataset, crawl_size), "w")
        entry, prefix = self.entry, self.prefix
        self.url_stack = [(entry, "", 0)]  #(entry,parent_url,crawl_level)
        self.final_list = []
        size, num = crawl_size, 0  # crawl budget and number of pages crawled so far
        crawl_id = 0
        s = sampler(self.dataset, self.entry, self.prefix, 0)
        end = 0
        num_web_crawl = 0
        while (num < size and len(self.url_stack) > 0):
            print self.url_stack[-1]
            print self.url_stack[0]
            first_url = self.url_stack[end][0]
            parent_url = self.url_stack[end][1]
            crawl_level = self.url_stack[end][2]

            try:
                print "first url is ", first_url

            except:
                traceback.print_exc()

            if first_url not in self.history_set:
                num += 1
                try:
                    url_list = self.crawl_link(first_url, crawl_level,
                                               self.history_set, s)
                    print "url list", len(url_list)
                    self.url_stack.pop(end)
                    self.url_stack += url_list
                    self.final_list.append(
                        (first_url, parent_url, crawl_level))
                except:
                    print "might miss somthing here"
                    traceback.print_exc()
                    flag = self.crawlUrl(first_url, self.dataset,
                                         self.url_stack, self.history_set)
                    if flag == 1:
                        url_list = self.crawl_link(first_url, crawl_level,
                                                   self.history_set, s)
                        self.url_stack.pop(end)
                        print "url list", len(url_list)
                        self.url_stack += url_list
                        self.final_list.append(
                            (first_url, parent_url, crawl_level))
                        random_time_s = random.randint(5, 10)
                        time.sleep(random_time_s)
                        num_web_crawl += 1
                        if num_web_crawl % 10 == 9:
                            random_time_s = random.randint(60, 90)
                            time.sleep(random_time_s)
                    else:
                        num -= 1
                        print "crawl failure"
            else:
                self.url_stack.pop(end)

            end = random.choice([0, -1])
            print "end is ", end
            crawl_id += 1
            print " num is {}".format(num)
            sys.stdout.flush()
            if num >= size:
                print "crawl_id is {0} for size {1}".format(crawl_id, size)

            self.history_set.add(first_url)
        print len(self.final_list), "length of final list"

        for pair in self.final_list:
            url, parent_url, crawl_level = pair[0], pair[1], pair[2]
            write_file.write(url + "\t" + str(parent_url) + "\t" +
                             str(crawl_level) + '\n')
Example #6
    def crawling(self, num_crawl):

        counter = Counter(sitemap.UP_pages.category)
        self.c_prob = defaultdict(float)
        total = sum(counter.values())
        for key in counter:
            self.c_prob[key] = float(counter[key]) / float(total)

        # self.entry, self.prefix, self.dataset, self.trans_xpath_dict, target_cluster id
        #self.target_cluster = self.get_sample_cluster()
        write_file = open(
            "./results/{0}_{1}_{2}_{3}_size{4}.txt".format(
                self.dataset, self.date, self.cluster_rank, self.rank_algo,
                self.crawl_size), "w")
        num_web_crawl = 0
        entry, prefix = self.entry, self.prefix
        self.url_stack = [(entry, "", "", self.max_score)]
        self.final_list = []
        size, num = num_crawl, 0  # crawl budget and number of pages crawled so far
        crawl_id = 0
        s = sampler(self.dataset, self.entry, self.prefix, 0)
        while (num < size and len(self.url_stack) > 0):
            first_url = self.url_stack[0][0]
            parent_url = self.url_stack[0][1]
            parent_xpath = self.url_stack[0][2]
            score = self.url_stack[0][3]
            print self.url_stack[0]
            print self.url_stack[-1]

            #first_url = self.url_stack[0][0]
            try:
                print "first url is " + first_url

            except:
                traceback.print_exc()
            if first_url not in self.history_set:
                num += 1
                try:
                    url_list, cluster_id = self.crawl_link(
                        first_url, self.history_set, s)
                    #print "url_list", url_list
                    self.sort_queue(url_list, first_url,
                                    self.rank_algo)  # sort url_stack
                    self.final_list.append((first_url, parent_url,
                                            parent_xpath, score, cluster_id))
                except:
                    print "might miss somthing here"
                    traceback.print_exc()
                    flag = s.crawlUrl(first_url, self.dataset, self.url_stack,
                                      self.history_set)
                    if flag == 1:
                        url_list, cluster_id = self.crawl_link(
                            first_url, self.history_set, s)
                        self.sort_queue(url_list,
                                        first_url,
                                        rank_algo=self.rank_algo)
                        self.final_list.append(
                            (first_url, parent_url, parent_xpath, score,
                             cluster_id))
                        random_time_s = random.randint(5, 10)
                        time.sleep(random_time_s)
                        num_web_crawl += 1
                        if num_web_crawl % 10 == 9:
                            random_time_s = random.randint(60, 90)
                            time.sleep(random_time_s)
                    else:
                        num -= 1
                        print "crawl failure"
            if self.url_stack[0][0] == first_url:
                self.url_stack.pop(0)
            crawl_id += 1
            print " num is {}".format(num)
            sys.stdout.flush()
            if num >= size:
                print "crawl_id is {0} for size {1}".format(crawl_id, size)

                #print "first url comes from the {} th crawled page".format(self.page_num[first_url])
            self.history_set.add(first_url)
        print len(self.final_list), "length of final list"
        for pair in self.final_list:
            url, parent_url, parent_xpath, score, cluster_id = pair[0], pair[
                1], pair[2], pair[3], pair[4]
            write_file.write(url + "\t" + str(parent_url) + "\t" +
                             str(parent_xpath) + "\t" + str(score) + "\t" +
                             str(cluster_id) + '\n')
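The crawler above keeps url_stack ordered by a link score via sort_queue, i.e. a best-first frontier. As an illustrative alternative only (not the project's sort_queue), the same policy can be expressed with a heap:

import heapq

class Frontier(object):
    """Best-first frontier: pop the highest-scoring URL first.
    heapq is a min-heap, so scores are stored negated."""

    def __init__(self):
        self._heap = []

    def push(self, url, parent_url, parent_xpath, score):
        heapq.heappush(self._heap, (-score, url, parent_url, parent_xpath))

    def pop(self):
        neg_score, url, parent_url, parent_xpath = heapq.heappop(self._heap)
        return url, parent_url, parent_xpath, -neg_score

    def __len__(self):
        return len(self._heap)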
Example #7
    def sampling(self, num_crawl, method="uniform"):
        # need to read the pagerank dict from file
        if method == "pagerank":
            path = "./src/data/{0}/{0}.pr_dict".format(self.dataset)
            with open(path, "rb") as outfile:
                pr_dict = pickle.load(outfile)
            avg_pr = sum(pr_dict.values()) / len(pr_dict)
            print avg_pr, "average pagerank"
        elif method == "indegree":
            path = "./src/data/{0}/{0}.inlink_dict".format(self.dataset)
            with open(path, "rb") as outfile:
                inlink = pickle.load(outfile)
            indegree_dict = defaultdict(int)
            for key in inlink:
                indegree_dict[key] = len(inlink[key])
            avg_indegree = sum(indegree_dict.values()) / len(indegree_dict)
        elif method == "est_prob":
            counter = Counter(sitemap.UP_pages.category)
            self.c_prob = defaultdict(float)
            total = sum(counter.values())
            for key in counter:
                self.c_prob[key] = float(counter[key]) / float(total)

        self.crawl_history = Counter()
        for i in range(self.cluster_num):
            self.crawl_history[i] = 1

        write_file = open(
            "./results/sampling/random_{0}_{1}_size{2}.txt".format(
                method, self.dataset, self.crawl_size), "w")
        num_web_crawl = 0
        entry, prefix = self.entry, self.prefix
        self.url_stack, self.crawl_length = [entry], 0
        self.final_list, url_list, last_list = [], [], []
        size, num = num_crawl, 0  # crawl budget and number of pages crawled so far
        s = sampler(self.dataset, self.entry, self.prefix, 0)
        while (num < size and len(self.url_stack) > 0):
            first_url = self.url_stack[0]
            print "first_url", first_url
            try:
                sys.stdout.write("num is {}\n".format(num))
                sys.stdout.flush()
                #print num, "num"
                url_list, cluster_id = self.sample_link(first_url, s, method)
                if first_url not in self.history_set:
                    self.final_list.append((first_url, cluster_id))
                    num += 1
                # add url to sample history anyway
                self.crawl_history[cluster_id] += 1
                self.crawl_length += 1

            except:
                print "might miss somthing here"
                traceback.print_exc()
                flag = s.crawlUrl(first_url, self.dataset, self.url_stack,
                                  self.history_set)
                if flag == 1:
                    sys.stdout.write("num is {}\n".format(num))
                    sys.stdout.flush()
                    #print num, "num"
                    url_list, cluster_id = self.sample_link(
                        first_url, s, method)
                    print url_list
                    if first_url not in self.history_set:
                        num += 1
                        self.final_list.append((first_url, cluster_id))
                    random_time_s = random.randint(5, 10)
                    time.sleep(random_time_s)
                    #num_web_crawl += 1
                    if num_web_crawl % 10 == 9:
                        random_time_s = random.randint(60, 90)
                        time.sleep(random_time_s)
                else:
                    #change the first_url from parent sampling
                    print num, "num"
                    traceback.print_exc()
                    pass

            if self.url_stack[0] == first_url:
                self.url_stack.pop(0)
                self.history_set.add(first_url)

            probability = 0.15
            if method == "uniform":
                # with probability 0.15 restart from the history set,
                # otherwise sample uniformly from this page's out-links
                # (falling back to the history set if there are none)
                if random.random() < probability:
                    self.select_from_history_set()
                else:
                    try:
                        print url_list
                        id = random.randrange(len(url_list))
                        self.url_stack.append(url_list[id])
                        print url_list[id], "select from out-links"
                    except:
                        self.select_from_history_set()

            elif method == "pagerank":
                if random.random() < probability:
                    url = random.sample(self.history_set, 1)[0]
                    self.url_stack.append(url)
                    print url, "random sampled from history set"
                else:
                    try:
                        id = self.sample_from_dist(url_list, pr_dict, avg_pr)
                        self.url_stack.append(url_list[id])
                        print url_list[id], "select from out-links"
                    except:
                        self.select_from_history_set()
            elif method == "indegree":
                print "sample from orcacle indegree"
                if random.random() < probability:
                    url = random.sample(self.history_set, 1)[0]
                    self.url_stack.append(url)
                    print url, "random sampled from history set"
                else:
                    try:
                        id = self.sample_from_dist(url_list, indegree_dict,
                                                   avg_indegree)
                        self.url_stack.append(url_list[id])
                        print url_list[id], "select from out-links"
                    except:
                        traceback.print_exc()
                        self.select_from_history_set()

            else:  # our method
                if random.random() < probability:
                    url = random.sample(self.history_set, 1)[0]
                    self.url_stack.append(url)
                    print url, "random sampled from history set"
                else:
                    try:
                        id = self.sample_from_prob_list(url_list)
                        self.url_stack.append(url_list[id][0])
                        print url_list[id], "select from out-links"
                    except:
                        traceback.print_exc()
                        self.select_from_history_set()

        print len(self.final_list), "length of final list"
        for pair in self.final_list:
            url, cluster_id = pair[0], pair[1]
            write_file.write(url + "\t" + str(cluster_id) + '\n')
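For method == "uniform" the loop above is a random walk with restart: with probability 0.15 it jumps back to an already visited URL, otherwise it follows a uniformly random out-link, falling back to the visited set when the page has none. A minimal sketch of that single selection step, not the project's code:

import random

def next_url(out_links, history_set, restart_prob=0.15):
    if out_links and random.random() >= restart_prob:
        return random.choice(out_links)       # follow a uniformly random out-link
    return random.choice(list(history_set))   # restart from an already visited URL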