Example #1
File: key.py  Project: wpfjtmwls/amazen
import json
import time
from collections import defaultdict

import pybloom_live
from nltk.corpus import stopwords

# Assumed from the surrounding project: `r` (a Redis client) and the
# break_rev() tokenizer helper.
def run(fname):
  stopws = stopwords.words('english')
  bf = pybloom_live.BloomFilter(capacity=100000, error_rate=0.001)
  with open("data/" + fname, "r") as f:
    start = time.time()
    print("Starting {0}.".format(fname))
    for i, l in enumerate(f):
      reviews_dict = defaultdict(int)
      scores_dict = defaultdict(float)
      count_dict = defaultdict(int)
      revs = json.loads(l)["allReviews"]
      for rev in revs:
        score = rev[0]
        wlist = break_rev(rev[1], stopws, bf)
        for w in wlist:
          count = r.hget("#counts", w)
          if count is None: continue
          if int(count) < 1000 or int(count) > 7000: continue
          reviews_dict[w] += 1
          scores_dict[w] += score
          count_dict[w] += 1
      for key in reviews_dict:
        reviews_dict[key] = reviews_dict[key] / int(r.hget("#counts", key))
      results = [(x[0], x[1], (scores_dict[x[0]] / count_dict[x[0]]), count_dict[x[0]]) for x in reviews_dict.items()]
      results.sort(key=lambda x: x[1])
      if 'title' in l:
        print(json.loads(l)["title"])
        print(list(map(lambda x: x[0], results[::-1][:10])))
      if i % 100 == 0:
        print("{2} {0} products, refreshing bf, took {1} seconds since last print".format(i, time.time() - start, fname))
        bf = pybloom_live.BloomFilter(capacity=100000, error_rate=0.001)
        start = time.time()
      if i == 10: break
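
A note on the periodic re-creation of bf above: pybloom_live's fixed-size BloomFilter raises an IndexError once it is filled past its capacity, so the code simply discards the filter every 100 products. A sketch of an alternative that grows on demand instead (same library; the loop data is a hypothetical stand-in):

import pybloom_live

# ScalableBloomFilter adds filter slices as items arrive, so it never hits a
# hard capacity while keeping the overall error rate bounded.
sbf = pybloom_live.ScalableBloomFilter(
    initial_capacity=100000,
    error_rate=0.001,
    mode=pybloom_live.ScalableBloomFilter.LARGE_SET_GROWTH)
for i in range(200000):  # more items than the initial capacity
    sbf.add('word-%d' % i)
print(len(sbf))  # approximate number of distinct items added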
Example #2
# Assumes `import pybloom_live` at module level.
def initialize_bloom_filter(item, bloom_error_rate):
    md5, tokens = item
    bloom_filter = pybloom_live.BloomFilter(capacity=len(set(tokens)),
                                            error_rate=bloom_error_rate)
    for token in tokens:
        bloom_filter.add(token)
    return (md5, bloom_filter)
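
A minimal usage sketch for the helper above; the digest and token list are hypothetical stand-ins:

import pybloom_live

item = ('9e107d9d372bb6826bd81d3542a419d6', ['alpha', 'beta', 'alpha'])  # hypothetical
md5, bf = initialize_bloom_filter(item, bloom_error_rate=0.001)
print('beta' in bf)   # True: the token was added
print('gamma' in bf)  # False, barring a false positive at the configured error rate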
Example #3
File: 12.py  Project: sbcheng/webrobot
# Assumes project-level globals: list_open, and the queues url_next_num / url_current_num.
def queue_arrange():
    m = 0
    url_total = list_open.url_list_read()
    url_next_num.put(url_total[-1])
    b = pybloom_live.BloomFilter(capacity=90000000, error_rate=0.01)
    with open('urllist.txt', 'r') as file_object:
        for line in file_object:
            b.add(line.rstrip())
    while True:
        try:
            url_current = []
            if not url_current_num.empty():
                # De-duplicate the incoming URLs and push new ones into the master list and queue.
                while not url_current_num.empty():
                    url_current.append(url_current_num.get())

                for num1 in url_current:
                    if num1 not in b:
                        #url_total.append(num1)
                        url_next_num.put(num1)
                        b.add(str(num1))
                m = m + 1
                print('URL manager loop has run %d times' % m)
                print('url_total list has %d elements' % len(url_total))
                #b.sync()
                #time.sleep(2)
        except Exception:
            # Keep the manager loop alive on transient errors.
            continue
Example #4
# Assumes the same imports as Example #1, plus the project's break_string() helper.
def run(fname):
    out_file = "wordset/" + fname + "-wordset"
    stopws = set(stopwords.words('english'))
    bf = pybloom_live.BloomFilter(capacity=1000000, error_rate=0.001)

    with open("data/" + fname, "r") as f:
        with open(out_file, "w") as out_f:
            # Timer
            start = time.time()
            print("Starting {0}.".format(fname))
            tokenized_words = set()
            for i, l in enumerate(f):
                d = json.loads(l)
                # Break reviews
                revs = d["allReviews"]
                for rev in revs:
                    for w in break_string(rev[1], stopws, bf):
                        tokenized_words.add(w)
                # Break title
                if "title" in d:
                    for w in break_string(d["title"], stopws, bf):
                        tokenized_words.add(w)
                # Break desc
                if "description" in d:
                    for w in break_string(d["description"], stopws, bf):
                        tokenized_words.add(w)
                # Timer
                if (i + 1) % 100 == 3:
                    bf = pybloom_live.BloomFilter(capacity=1000000,
                                                  error_rate=0.001)
                    print(
                        "{2} {0} products, refreshing bf, took {1} seconds since last print"
                        .format(i,
                                time.time() - start, fname))
                    start = time.time()
            for w in tokenized_words:
                out_f.write(w)
                out_f.write("\n")
Example #5
    def load_bfobj(self):
        """load bloom file obj"""
        files_list = os.walk(defaults.BLOOM_FILE_PATH)
        files_list_str = str(list(files_list))
        # bloom file not exist --> first run
        if defaults.BLOOM_FILE_NAME not in files_list_str:
            print('init')
            return pybloom_live.BloomFilter(capacity=self.capacity,
                                            error_rate=self.error_rate)

        # bloom file exist
        with open(defaults.BLOOM_FILE_PATH + defaults.BLOOM_FILE_NAME,
                  'rb') as fp:
            print('load_bfobj',
                  defaults.BLOOM_FILE_PATH + defaults.BLOOM_FILE_NAME)
            return pybloom_live.BloomFilter.fromfile(fp)
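
load_bfobj restores a saved filter with BloomFilter.fromfile; the matching save step uses its tofile counterpart. A sketch of such a method, assuming the same defaults constants (save_bfobj itself is hypothetical, not part of the project):

    def save_bfobj(self, bfobj):
        """Persist the bloom filter so load_bfobj can restore it on the next run."""
        with open(defaults.BLOOM_FILE_PATH + defaults.BLOOM_FILE_NAME,
                  'wb') as fp:
            bfobj.tofile(fp)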
Example #6
File: spider.py  Project: src-kun/hunting
    # Assumes `import tldextract` and `import pybloom_live` at module level.
    def __init__(self,
                 url,
                 deep=4,
                 thread_num=100,
                 url_beyond=100,
                 suspend=5,
                 timeout=5,
                 capacity=1000000,
                 error_rate=0.001):

        # starting URL for the crawler
        self._url = url
        if url.endswith('/'):
            self._url = url[:-1]

        # top-level domain
        self.top_domain = tldextract.extract(url).domain
        # all crawled URLs
        self.__urls_dict = {}

        # temporary URL store: {'url': [URLs extracted from that page]}
        self._urls_dict_tmp = {}
        self.urls_dict_tmp = url

        # crawl depth
        self.deep = deep
        self.bloom = pybloom_live.BloomFilter(capacity=capacity,
                                              error_rate=error_rate)

        self._req_tds = []

        # pause when the URL backlog exceeds this count
        self.url_beyond = url_beyond
        # pause duration
        self.suspend = suspend
        # number of threads
        self.thread_num = thread_num
        # request timeout
        self.timeout = timeout
Example #7
# Assumes: scrapy, pymongo, pybloom_live, the project's settings module, and the
# project helpers JobItem, DataReduction, Normalize_salary, Normalize_careers.
class Vieclam24hQlSpider(scrapy.Spider):
    name = 'vieclam24h_QL'
    start_urls = [
        'https://vieclam24h.vn/tim-kiem-viec-lam-nhanh/?hdn_nganh_nghe_cap1=&hdn_dia_diem=&hdn_tu_khoa=&hdn_hinh_thuc=&hdn_cap_bac=',
    ]
    ur = pybloom_live.BloomFilter(capacity=2097152, error_rate=0.005)
    collection_name = 'News'
    client = pymongo.MongoClient(settings.MONGO_URI)
    db = client[settings.MONGO_DATABASE]
    collection = db[collection_name]
    Y_in_db = list(
        collection.find({}, {
            "title": 1,
            "company": 1,
            "address": 1,
            "_id": 0
        }))
    no_duplicate_items = 0

    def parse(self, response):

        for tn in response.xpath('//div[@class="list-items "]/div/div/span'):
            src = tn.xpath('a/@href').extract_first()
            src = response.urljoin(src)
            add_url = self.ur.add(src)
            if add_url is False:
                yield scrapy.Request(src, callback=self.parse_src)

        next_pages = response.xpath('//li[@class="next"]/a/@href').extract()
        next_page = next_pages[-1] if next_pages else None

        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_src(self, response):
        self.item = JobItem()
        self.item["url"] = response.request.url
        title = response.xpath(
            '//div[@class="col-xs-12"]/h1[@class="text_blue font28 mb_10 mt_20 fws title_big"]/text()'
        ).extract()
        x_title = title[0]
        self.item["title"] = x_title
        # Company
        company = response.xpath(
            '//p[@class="font16"]//a[@class="text_grey3"]/text()').extract()
        x_company = company[0]
        self.item['company'] = x_company
        # Workplace
        addresses = response.xpath(
            '//span[@class="pl_28"]//a[@class="job_value text_pink"]/text()'
        ).extract()
        address = ', '.join([
            address.replace("Việc làm", "").replace("TP.HCM",
                                                    "Hồ Chí Minh").strip()
            for address in addresses
        ])
        self.item['address'] = address

        #Check duplicate
        data_need_check = DataReduction(
            3, [[job['title'], job['company'], job['address']]
                for job in self.Y_in_db])
        if data_need_check.is_match([x_title, x_company, address]):
            self.no_duplicate_items += 1
            print(self.no_duplicate_items)
            return

        # Salary
        salary = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Mức lương")]/span/text()'
        ).extract()
        if len(salary) > 0:
            salary_str = " ".join(salary)
            salary_need_normalize = Normalize_salary()
            salary_normalized = salary_need_normalize.normalize_salary(
                salary_str)
            self.item["salary"] = salary_normalized
        else:
            self.item["salary"] = 'Thỏa thuận'  # "negotiable"

        # Experience
        experience = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Kinh nghiệm")]/span/text()'
        ).extract()
        if len(experience) > 0:
            self.item["experience"] = experience[0]
        else:
            self.item["experience"] = 'Không yêu cầu'
        # Diploma
        diploma = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Yêu cầu bằng cấp")]/span/text()'
        ).extract()
        if len(diploma) > 0:
            self.item["diploma"] = diploma[0]
        else:
            self.item['diploma'] = 'Không yêu cầu'
        # Number of openings
        amount = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Số lượng cần tuyển")]/span/text()'
        ).extract()
        if len(amount) > 0:
            self.item["amount"] = amount[0]
        else:
            self.item['amount'] = 'Không yêu cầu'
        # Career field
        career = response.xpath(
            '//div[@class="line-icon mb_12"]//h2[contains(text(),"Ngành nghề")]//a/text()'
        ).extract()
        career_need_nomarlize = Normalize_careers()
        career_normalized = career_need_nomarlize.normalize_careers(career)
        self.item["career"] = career_normalized

        # Position
        position = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Chức vụ")]/span/text()'
        ).extract()
        if len(position) > 0:
            self.item["position"] = position[0]
        else:
            self.item['position'] = 'Không yêu cầu'

        # Employment type: full-time/part-time
        category = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Hình thức làm việc")]/span/text()'
        ).extract()
        if len(category) > 0:
            self.item["category"] = category[0]
        else:
            self.item['category'] = 'Không yêu cầu'
        # Probation period
        trial_time = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Thời gian thử việc")]/span/text()'
        ).extract()
        if len(trial_time) > 0:
            self.item["trial_time"] = trial_time[0]
        else:
            self.item['trial_time'] = 'Không yêu cầu'
        # Gender requirement
        sex = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Yêu cầu giới tính")]/span/text()'
        ).extract()
        if len(sex) > 0:
            self.item["sex"] = sex[0]
        else:
            self.item['sex'] = 'Không yêu cầu'
        # Age requirement
        age = response.xpath(
            '//div[@class="col-xs-6"]//p//span[contains(text(),"Yêu cầu độ tuổi")]/span/text()'
        ).extract()
        if len(age) > 0:
            self.item["age"] = age[0]
        else:
            self.item['age'] = 'Không yêu cầu'

        # Description
        description = response.xpath(
            '(//div[@id="ttd_detail"]//div[@class="item row"])[1]//p[@class="col-md-9 pr_0 mb_0 word_break"]/text()'
        ).extract()
        self.item["description"] = " ".join(
            [des.strip() for des in description])
        # Benefits
        benefits = response.xpath(
            '(//div[@id="ttd_detail"]//div[@class="item row"])[2]//p[@class="col-md-9 pr_0 mb_0 word_break"]/text()'
        ).extract()
        self.item["benefits"] = " ".join(
            [benefit.strip() for benefit in benefits])
        # Other requirements
        require_skills = response.xpath(
            '(//div[@id="ttd_detail"]//div[@class="item row"])[3]//p[@class="col-md-9 pr_0 mb_0 word_break"]/text()'
        ).extract()
        self.item["require_skill"] = " ".join(
            [require_skill.strip() for require_skill in require_skills])
        # Contact information
        per_contact = response.xpath(
            '(//div[@class="job_description bg_white pl_24 pr_24 mt_16 pb_18 box_shadow"]//div[@class="item row pt_14 pb_14"])[1]//p[@class="col-md-9 pr_0 mb_0"]/text()'
        ).extract()
        add_contact = response.xpath(
            '(//div[@class="job_description bg_white pl_24 pr_24 mt_16 pb_18 box_shadow"]//div[@class="item row pt_14 pb_14"])[2]//p[@class="col-md-9 pr_0 mb_0"]/text()'
        ).extract()

        contact = u"Người liên hệ: " + per_contact[0].strip(
        ) + u" Địa chỉ liên hệ: " + add_contact[0].strip()
        self.item["contact"] = contact

        # Application deadline
        expired = response.xpath(
            '(//span[@class="text_pink"])[1]/text()').extract()  # expiry date
        if len(expired) > 0:
            self.item["expired"] = expired[0]
        # Posting creation date
        created = response.xpath(
            '(//p[@class="text_grey2 font12 mt8 mb12"]//span)[3]/text()'
        ).extract()  # created date
        if len(created) > 0:
            created_at = created[0][14:]
            self.item["created"] = created_at
        if self.item["title"] != "":
            yield self.item
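
Both this spider and the next rely on the return value of BloomFilter.add for URL de-duplication: in pybloom_live, add(key) returns True when the key was already present, so `add_url is False` selects previously unseen URLs. The pattern in isolation (the URLs are hypothetical):

import pybloom_live

seen = pybloom_live.BloomFilter(capacity=1000, error_rate=0.005)
for url in ['https://example.com/a', 'https://example.com/b', 'https://example.com/a']:
    if seen.add(url) is False:  # False means the URL had not been seen before
        print('crawl', url)     # the repeated '/a' is skipped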
Example #8
# Assumes the same imports and project helpers as Example #7, plus `re`.
class CareerbuilderSpider(scrapy.Spider):
    name = 'careerbuilder'
    start_urls = ['https://careerbuilder.vn/viec-lam/tat-ca-viec-lam-vi.html/']
    ur = pybloom_live.BloomFilter(capacity=2097152, error_rate=0.005)
    collection_name = 'News'
    # Y_in_db = []
    client = pymongo.MongoClient(settings.MONGO_URI)
    db = client[settings.MONGO_DATABASE]
    collection = db[collection_name]
    Y_in_db = list(collection.find({}, {"title":1,"company":1, "address":1, "_id":0}))
    no_duplicate_items = 0
    def parse(self, response):
        # client = pymongo.MongoClient(self.settings.get("MONGO_URI"))
        # db = client[self.settings.get("MONGO_DATABASE")]
        # collection = db[self.collection_name]
        # self.Y_in_db = list(collection.find({}, {"title":1,"company":1, "address":1, "_id":0}))
        # Time start
        for tn in response.xpath('//h3[@class="job"]'):
            src = tn.xpath('a/@href').extract_first()
            src = response.urljoin(src)
            add_url = self.ur.add(src)
            if add_url is False:
                yield scrapy.Request(src, callback=self.parse_src)


        next_pages = response.xpath('//a[@class="right"]/@href').extract()
        next_page = next_pages[-1] if next_pages else None
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
    def parse_src(self, response):
        self.item = JobItem()
        self.item["url"] = response.request.url
        # Career field
        career = response.xpath("//div[@id='showScroll']/ul/li/p/span[contains(text(), 'Ngành nghề')]/following-sibling::b/a/text()").extract()
        print(career)
        career_need_nomarlize = Normalize_careers()
        career_normalized = career_need_nomarlize.normalize_careers(career)
        self.item["career"] = career_normalized

        # Title
        title = response.xpath('//div[@class="top-job-info"]/h1/text()').extract()
        x_title = title[0]
        self.item["title"] = x_title
        company = response.xpath('//div[@class="tit_company"]/text()').extract()
        x_company = company[0]
        self.item["company"] = x_company
        # Workplace
        address = response.xpath("//div[@id='showScroll']/ul/li/p/span[contains(text(), 'Nơi làm việc')]/following-sibling::b/a/text()").extract()
        add = ", ".join(address)
        self.item["address"] = add
         
        data_need_check = DataReduction(3, [[job['title'], job['company'], job['address']] for job in self.Y_in_db])      
        
        #Check duplicate
        if data_need_check.is_match([x_title, x_company, add]):
            self.no_duplicate_items += 1
            print(self.no_duplicate_items)
            return
        
        # Salary
        salary = response.xpath("//div[@id='showScroll']/ul/li/p/span[contains(text(), 'Lương')]/following-sibling::label/text()").extract()

        if len(salary) > 0:
            salary_str = " ".join(salary)
            salary_need_normalize = Normalize_salary()
            salary_normalized = salary_need_normalize.normalize_salary(salary_str)
            self.item["salary"] = salary_normalized
        else:
            self.item["salary"] = 'Thỏa thuận'  # "negotiable"
        # Experience
        experience = response.xpath("//div[@id='showScroll']/ul/li/p/span[contains(text(), 'Kinh nghiệm')]/../text()").extract()
        if len(experience) > 0:
            self.item["experience"] = experience[0].strip()
        else:
            self.item["experience"] = "Không yêu cầu"  # "no requirement"
        descriptions =  response.xpath("//div[@class='MarBot20']/h4[contains(text(),'Mô tả Công việc')]/following-sibling::div[@class='content_fck']//text()").extract()
        self.item["description"] = ' '.join([description.replace('-', '').strip() for description in descriptions])
        # Other info: diploma, age, and employment type
        info_others = response.xpath("//div[@class='MarBot20']/h4[contains(text(),'Thông tin khác')]/following-sibling::div[@class='content_fck']/ul/li//text()").extract()
        # Bang cap
        diploma = [info_other.strip() for info_other in info_others if
                   "bằng cấp" in info_other.lower() or "tốt nghiệp" in info_other.lower()] + [
                      description.strip() for description in descriptions if "bằng cấp" in description.lower() or "tốt nghiệp" in description.lower()]
        if len(diploma) > 0:
            self.item['diploma'] = diploma[0].split(':')[-1].strip()
        else:
            self.item['diploma'] = 'Không yêu cầu'
        # Number of openings (not provided on this site)
        amount = ""
        self.item["amount"] = amount
        
        
        # Position
        position = response.xpath('(//p[@class="fl_right"])[1]//label/text()').extract()
        if len(position) > 0:
            self.item["position"] = position[0]

        # Employment type: full-time/part-time
        category = [info_other.strip() for info_other in info_others if "hình thức" in info_other.lower()] + [description.strip() for description in descriptions if "hình thức" in description.lower()]
        if len(category) > 0:
            self.item['category'] = category[0].split(':')[-1].strip()
        else:
            self.item['category'] = 'Không yêu cầu'
        # Probation period
        trial_time = [info_other.strip() for info_other in info_others if "thời gian thử việc" in info_other.lower()]
        if len(trial_time) > 0:
            self.item["trial_time"] = trial_time
        else:
            self.item['trial_time'] = 'Không yêu cầu'
        # Gender requirement
        sex_male = "Nam" if len([info_other for info_other in info_others if "nam" in info_other.lower()]) > 0 else ""
        sex_female = "Nữ" if len([info_other for info_other in info_others if "nữ" in info_other.lower()]) > 0 else ""
        sex = sex_male + ("/" if sex_male and sex_female else "") + sex_female
        if sex == "":
            self.item['sex'] = "Không yêu cầu"
        else:
            self.item['sex'] = sex.strip()
        # Age requirement
        ages = [other.strip() for other in info_others if "tuổi" in other] + \
               [description.strip() for description in descriptions if "tuổi" in description]
        if len(ages) > 0:
            self.item["age"] = ages[0].split(":")[-1].strip()
        else:
            self.item["age"] = 'Không yêu cầu'


        # Benefits
        benefits = response.xpath("//div[@class='MarBot20 benefits-template']/h4[contains(text(),'Phúc lợi')]/following-sibling::ul/li/text()").extract()
        self.item["benefits"] = ', '.join(benefits).strip()
        # Other requirements
        require_skills = response.xpath("//div[@class='MarBot20']/h4[contains(text(),'Yêu Cầu Công Việc')]/following-sibling::div[@class='content_fck']//text()").extract()
        self.item["require_skill"] = " ".join([skill.replace('-', '').strip() for skill in require_skills])

        # Contact information
        per_contact = response.xpath('(//p[@class="TitleDetailNew"]//label)[3]//strong/text()').extract()
        add_contact = response.xpath('(//p[@class="TitleDetailNew"]//label)[2]/text()').extract()
        if len(per_contact) > 0:
            pers_contact = re.sub(r'<.*?>', ' ', per_contact[0])
            pers_contact = re.sub(r'[\n\r]', ' ', pers_contact)
        else:
            pers_contact = ""
        if len(add_contact) > 0:
            addr_contact = re.sub(r'<.*?>', ' ', add_contact[0])
            addr_contact = re.sub(r'[\n\r]', ' ', addr_contact)
        else:
            addr_contact = ""
        # contact = pers_contact + "\n" +addr_contact
        contact = u"Người liên hệ: " + pers_contact.strip() + u" Địa chỉ liên hệ: " + addr_contact.strip()
        self.item["contact"] = contact
        
        # Application deadline
        expired = response.xpath('(//p[@class="fl_right"])[3]/text()').extract()  # expiry date
        if len(expired) > 0:
            self.item["expired"] = expired[0]
        # Posting creation date
        created = response.xpath('//div[@class="datepost"]//span/text()').extract()  # created date
        if len(created) > 0:
            self.item["created"] = created[0]
        if self.item["title"] != None:
            yield self.item
Example #9
import pybloom_live
import requests

if __name__ == '__main__':
    texts = requests.get('http://www.gutenberg.org/files/2852/2852-0.txt')
    bloom = pybloom_live.BloomFilter(capacity=10000, error_rate=0.1)
    for each in texts.text.split():
        bloom.add(each)

    print(len(bloom))
    print('the' in bloom)
    print('luo_wei_' not in bloom)
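
With error_rate=0.1, the last check above can occasionally print False, since roughly one in ten absent keys will collide with already-set bits. A sketch that estimates the observed false-positive rate with hypothetical probe keys:

import pybloom_live

bloom = pybloom_live.BloomFilter(capacity=10000, error_rate=0.1)
for i in range(10000):
    bloom.add('member-%d' % i)

# Probe with keys that were never added; any hit is a false positive.
probes = 100000
false_hits = sum(1 for i in range(probes) if ('absent-%d' % i) in bloom)
print('observed false-positive rate: %.4f' % (false_hits / probes))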
Example #10
import re
import urllib.request

import pybloom_live

request_headers = {
    'connection': "keep-alive",
    'cache-control': "no-cache",
    'upgrade-insecure-requests': "1",
    'user-agent':
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
    'accept':
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
}

city_home_pages = []
city_ids = []
dirname = 'mafengwo_notes/'

# create the Bloom filter
download_bf = pybloom_live.BloomFilter(capacity=1024 * 1024 * 16, error_rate=0.01)


def download_city_notes(id):
    for i in range(1, 999):  # walk every page of this city's travel-note listing and download each one
        url = 'http://www.mafengwo.cn/yj/%s/1-0-%d.html' % (id, i)
        if url in download_bf:
            continue
        print("open url %s" % url)
        download_bf.add(url)
        req = urllib.request.Request(url, headers=request_headers)
        response = urllib.request.urlopen(req)
        htmlcontent = response.read().decode("utf-8")
        city_notes = re.findall(r'href="/i/\d{7}.html', htmlcontent)

        # If the listing page is invalid and contains 0 travel notes, 1-0-xxx.html has
        # been fully traversed; finish this city.