Example #1
 def dispatch_loop(self):
     while True:
         if os.path.isdir('stop_spider'):
             print('\nStopping spider crawling...')
             break
         else:
             print('\n',
                   time.strftime('%m/%d/%Y'),
                   time.strftime('%H:%M:%S'),
                   ' Disk space ',
                   end='')
             total, used, free = shutil.disk_usage('.')
             total = int(total / 1024 / 1024 / 1024)
             used = int(used / 1024 / 1024 / 1024)
             free = int(free / 1024 / 1024 / 1024)
             print_prog(' ' * 5 + 'used',
                        used,
                        total,
                        left_just=5,
                        bar_length=20,
                        endwith='')
             print_prog(' free', free, total, left_just=5, bar_length=20)
             print('Getting fresh proxies...')
             #get_proxies()
             result = Popen(
                 ('scrapy crawl %s' % cfg.SPIDER_LIST[0]).split(' '))
             result.wait()
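All of these examples drive the same helper, print_prog, whose implementation is not shown on this page. Judging from the call sites (a label, a current count, a total, plus left_just, bar_length and endwith keyword arguments), it renders a single-line progress bar. A minimal sketch under those assumptions, not the project's actual code:

import sys

def print_prog(name, current, total, left_just=10, bar_length=10, endwith='\n'):
    # Assumed behaviour: left-justified label, proportional bar, counts and percent.
    # endwith='' keeps the cursor on the same line (so a second bar can follow),
    # while endwith='\r' lets the next call overwrite the line in place.
    frac = (current / total) if total else 0
    filled = int(bar_length * frac)
    bar = '#' * filled + '-' * (bar_length - filled)
    sys.stdout.write('%s [%s] %d/%d (%3.0f%%)%s' % (
        name.ljust(left_just), bar, current, total, 100 * frac, endwith))
    sys.stdout.flush()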
Example #2
    def start_requests(self):
        start_links = []
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/66.0.3359.139 Safari/537.36')
        }

        for cat in db.cursordb.find({'dist': DigikeySpider.name}):
            if not cat['scan_complete']:
                start_links.append(cat['current_link'])
                DigikeySpider.total_count += cat['total_count']
                DigikeySpider.prog_count += cat['current_count']

        # self.perc_total = int(self.scan_perc*(DigikeySpider.total_count - DigikeySpider.prog_count))
        print_prog(self.name + ' Total progress',
                   DigikeySpider.prog_count,
                   DigikeySpider.total_count,
                   left_just=20,
                   endwith='')
        print_prog(' ' * 5 + 'Progress',
                   DigikeySpider.batch_count,
                   cfg.BATCH_SIZE,
                   left_just=5,
                   bar_length=20,
                   endwith='\r')
        for start_link in start_links:
            yield scrapy.Request(start_link, headers=headers)
Example #3
    def update_counts(self, response):
        ucat = response.meta['item']['cat_name'] + '__' + response.meta['item']['subcat_name']
        db.cursordb.update_one({'dist': DigikeyCount.name,'ucat': ucat},
                               {'$setOnInsert': {'dist': DigikeyCount.name.split('_')[0],
                                                 'ucat': ucat},
                                '$set': {'start_link': response.url,
                                         'current_link': response.url,
                                         'current_count': 0,
                                         'total_count': int(response.xpath(xp.CAT_COUNT).extract_first().replace(',', '')),
                                         'scan_complete': False}}, upsert=True)

        DigikeyCount.subcats_count += 1
        print_prog('Progress', DigikeyCount.subcats_count,
                   DigikeyCount.total_subcats, left_just=15, endwith='\r')
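The upsert above (together with Example #2, which reads the same collection back in start_requests) defines the per-subcategory resume record kept in cursordb. An illustrative document using the field names from these calls; every value below is made up:

{
    'dist': 'digikey',                        # assumed result of DigikeyCount.name.split('_')[0]
    'ucat': 'Capacitors__Ceramic Capacitors', # illustrative category__subcategory key
    'start_link': 'https://www.digikey.com/...',
    'current_link': 'https://www.digikey.com/...',
    'current_count': 0,
    'total_count': 12843,
    'scan_complete': False
}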
Example #4
 def update_prog(self, prog_bar=True):
     counts = db.prepdb.find_one({
         'dist': self.name.split('_')[0],
         'current_count': {
             '$exists': 1
         }
     })
     if prog_bar:
         print_prog(self.name + ' progress',
                    counts['current_count'],
                    counts['total_count'],
                    left_just=20,
                    endwith='\r')
     return (counts['current_count'], counts['total_count'])
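Since update_prog also returns the raw counts, a caller that only needs the numbers can skip the bar. A hypothetical call site:

current, total = self.update_prog(prog_bar=False)
if current >= total:
    self.logger.info('%s: scan complete (%d/%d)', self.name, current, total)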
Example #5
    def start_requests(self):
        start_links = []
        parts = db.metadb.find({'processed': 'error'})
        PdfDatasheetAgent.pdf_totcnt = parts.count()
        print_prog('Agenting',
                   PdfDatasheetAgent.pdf_cnt,
                   PdfDatasheetAgent.pdf_totcnt,
                   left_just=5,
                   bar_length=20,
                   endwith='\r')
        for part in parts:
            part_num = part['upart'].split('__')[0]
            start_links.append(cfg.PDF_AGENT_URL + part_num)
            PdfDatasheetAgent.uparts[part_num] = part['upart']

        for start_link in start_links:
            yield self.make_requests_from_url(start_link)
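A caveat on the last loop: make_requests_from_url has been deprecated and later removed in newer Scrapy releases. Assuming its default behaviour (a request with dont_filter=True dispatched to the spider's parse callback), an equivalent on current Scrapy would be:

for start_link in start_links:
    yield scrapy.Request(start_link, dont_filter=True)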
Example #6
    def parse(self, response):
        headers = ['import os', 'import sys',
                   "sys.path.append(os.path.join('..', '..', '..', '..'))",
                   'from collections import defaultdict',
                   'import mycat_tree as mt',
                   'class PartTree():',
                   '    cat_tree = defaultdict(dict)']
        with open(os.path.join(self.base_path,
                  'part_tree_%s.py' % time.strftime('%m%d%Y')), 'w') as f:
            for header in headers:
                f.write(header+'\n')
        headers = ["myrootcat = ''\n", "mycat = ''\n",
                  "mysubcat = mt.mytree[myrootcat][mycat]\n\n",
                  "subcats = ["]

        for cat in init.cats:
            cat = '/'+cat+'/'
            DigikeyCount.total_subcats += len(response.xpath(xp.PREP_SUBCAT % cat).extract())
        print_prog('Progress', DigikeyCount.subcats_count,
                   DigikeyCount.total_subcats, left_just=15, endwith='\r')

        with open(os.path.join(self.base_path,
                  'part_tree_%s.py' % time.strftime('%m%d%Y')), 'a') as f:
            for cat in init.cats:
                cat = '/'+cat+'/'
                cat_name = replace_illchar(response.xpath(xp.PREP_CATNAME % cat).extract_first())
                subcat_links = response.xpath(xp.PREP_SUBCAT_LINK % cat).extract()
                subcat_names = response.xpath(xp.PREP_SUBCAT_NAMES % cat).extract()
                
                for header in headers:
                    f.write(' '*4+header)
                
                for subcat_link, subcat_name in zip(subcat_links, subcat_names):
                    f.write(' '*15+"'%s',\n" % replace_illchar(subcat_name))
                    item = catsItem()
                    item['cat_name'] = cat_name
                    item['subcat_name'] = replace_illchar(subcat_name)
                    yield scrapy.Request(url='https://www.digikey.com'+subcat_link,
                                         callback=self.update_counts, meta={'item': item})
                f.write(']\n')
                f.write(' '*4+'mysub_idx = ['+' ,'*(len(subcat_names)-1)+']\n')
                f.write(' '*4+'for idx, subcat in zip(mysub_idx, subcats):\n')
                f.write(' '*8 + "cat_tree['%s'][subcat] = {'myrootcat': myrootcat, "
                                "'mycat': mycat, 'mysubcat': mysubcat[idx]}\n\n" % cat_name)
Example #7
    def dispatch_loop(self):
        while True:
            if os.path.isdir('.stop_spider'):
                print('\nStopping crawling spider...')
                break
            else:
                print('\n', time.strftime('%m/%d/%Y'), time.strftime('%H:%M:%S'),
                      ' Disk space ', end='')
                total, used, free = shutil.disk_usage('.')
                total = int(total/1024/1024/1024)
                used = int(used/1024/1024/1024)
                free = int(free/1024/1024/1024)
                print_prog(' '*5 + 'used', used, total, left_just=5,
                           bar_length=20, endwith='')
                print_prog(' free', free, total, left_just=5, bar_length=20)

                procs = [Popen(('scrapy crawl %s' % spider_to_run).split())
                         for spider_to_run in cfg.SPIDER_LIST]
                results = [proc.wait() for proc in procs]
Example #8
 def save_pdf(self, response):
     upart = response.meta['upart']
     # print(upart)
     if cfg.KEEP_GRIDFS_PDF:
         old_pdf = db.pdfdb.find_one({'filename': upart})
         if old_pdf:  # find_one returns None when no PDF is stored yet
             db.pdfdb.delete(old_pdf._id)
         db.pdfdb.put(response.body, filename=upart)
     db.distdb.update_one(
         {
             'part_num': upart.split('__')[0],
             'manufac': upart.split('__')[1]
         }, {'$set': {
             'pdf_link': response.url
         }})
     db.metadb.update_one({'upart': upart},
                          {'$set': {
                              'processed': 'pending'
                          }})
     PdfDatasheetAgent.pdf_cnt += 1
     print_prog('Agenting',
                PdfDatasheetAgent.pdf_cnt,
                PdfDatasheetAgent.pdf_totcnt,
                left_just=5,
                bar_length=20,
                endwith='\r')
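The put/delete/find_one calls on db.pdfdb match the gridfs.GridFS API from pymongo. A minimal sketch of how such a handle could be set up; the database and collection names here are assumptions, not taken from the project:

import gridfs
from pymongo import MongoClient

client = MongoClient()  # default localhost:27017
pdfdb = gridfs.GridFS(client['parts_db'], collection='pdf')  # exposes put()/delete()/find_one()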
Example #9
    def parse(self, response):
        cat = replace_illchar(response.xpath(xp.CAT).extract_first().strip())
        subcat = response.xpath(xp.SUBCAT).extract_first()
        subcat = replace_illchar(subcat[subcat.rindex(';') +
                                        1:subcat.rindex('<')].strip())
        table_len = len(response.xpath(xp.TABLE_LEN))

        table_heads = {}
        idx = xp.HEAD_START_IDX
        while True:
            head = response.xpath(xp.HEAD_TITLE % idx).extract()
            if head:
                table_heads[idx] = head[0]
                idx += 1
            else:
                break

        for idx in range(1, table_len + 1):
            part = {
                'part_num': replace_illchar(
                    response.xpath(xp.PART_NUM % idx).extract_first().strip()),
                'dist_num': response.xpath(xp.DIST_NUM % idx).extract_first().strip(),
                'manufac': replace_illchar(
                    response.xpath(xp.MANUFAC % idx).extract_first()),
                'descr': response.xpath(xp.DESCR % idx).extract_first().strip(),
                'unit_price': response.xpath(
                    xp.UNIT_PRICE % idx).extract_first().strip().replace(',', ''),
                'dist': self.name,
                'dist_partlink': ('http://www.digikey.com' +
                                  response.xpath(xp.DIST_LINK % idx).extract_first()),
                'pdf_link': response.xpath(xp.PDF_LINK % idx).extract_first(),
                'min_quan': response.xpath(
                    xp.MIN_QUAN % idx).extract_first().strip().replace(',', ''),
                'date_scraped': datetime.datetime.utcnow()
            }
            for title_idx, table_title in table_heads.items():
                misc_data = response.xpath(
                    xp.MISC_DATA % (idx, title_idx)).extract_first().strip()
                if misc_data:
                    text = '%s %s' % (table_title, misc_data)
                    # Special case for Digikey package dimensions
                    if 'Size' in text and 'Dimension' in text and 'L' in text and 'W' in text:
                        self.text_to.reset_result()
                        # Explicit decoration
                        self.err_to.logger(text, 'info', part['part_num'])(
                            self.text_to.parse)(text)

                        text = [
                            'Size Dimension Length %s%s' %
                            (self.text_to.result['param'][2][0],
                             self.text_to.result['param'][2][1]),
                            'Size Dimension Width %s%s' %
                            (self.text_to.result['param'][3][0],
                             self.text_to.result['param'][3][1])
                        ]
                        docs = list(
                            self.docs.make_mongoelem(
                                ext_elems_to_analyze=text))
                    else:
                        docs = [
                            list(
                                self.docs.make_mongoelem(
                                    ext_elems_to_analyze=[text]))[0]
                        ]

                    for sub_idx, doc in enumerate(docs):
                        col = '%s.%s' % (title_idx, sub_idx)
                        doc = {
                            **doc,
                            **{
                                'direct': 'v',
                                'manufac': part['manufac'],
                                'page_num': 0,
                                'part_num': part['part_num'],
                                'dist_num': part['dist_num'],
                                'col': col,
                            }
                        }
                        doc['word'] += lp.Parser.get_ngrams(doc['part_num']) \
                                       + lp.Parser.get_ngrams(doc['manufac'])
                        db.partdb.update_one(
                            {
                                'dist_num': part['dist_num'],
                                'col': col
                            }, {'$setOnInsert': doc},
                            upsert=True)
            try:
                if part['unit_price'][0] == '$':
                    part['unit_price'] = float(part['unit_price'][1:])
                else:
                    part['unit_price'] = float(part['unit_price'])
            except IndexError:
                part['unit_price'] = 'See distributor'
            except ValueError:
                pass
            if part['pdf_link']:
                if part['pdf_link'][0:7] == '//media':
                    part['pdf_link'] = 'http:' + part['pdf_link']
            else:
                part['pdf_link'] = ''

            part['partnum_manufac_ngram3'] = lp.Parser.get_ngrams(part['part_num']) \
                                             + lp.Parser.get_ngrams(part['manufac'])

            part['root_cat'] = pt.cat_tree[cat][subcat]['myrootcat']
            part['cat'] = pt.cat_tree[cat][subcat]['mycat']
            part['subcat'] = pt.cat_tree[cat][subcat]['mysubcat']

            db.invendb.update_one(
                {
                    'root_cat': part['root_cat'],
                    'cat': part['cat'],
                    'subcat': part['subcat']
                }, {'$inc': {
                    'total': 1
                }},
                upsert=True)

            always_update = {
                'unit_price': part['unit_price'],
                'min_quan': part['min_quan'],
                'date_scraped': part['date_scraped'],
                'pdf_link': part['pdf_link'],
                'dist_partlink': part['dist_partlink']
            }
            db.distdb.update_one(
                {
                    'part_num': part['part_num'],
                    'manufac': part['manufac']
                }, {
                    '$setOnInsert': {
                        k: v
                        for k, v in part.items()
                        if k not in always_update.keys()
                    },
                    '$set': always_update
                },
                upsert=True)

            # Filter on the plain manufacturer name (as in Example #10); filtering on the
            # ngram field would likely never match an existing document.
            db.manufacdb.update_one(
                {'manufac': part['manufac']},
                {'$setOnInsert': {
                    'manufac': part['manufac'],
                    'manufac_ngram3': lp.Parser.get_ngrams(part['manufac'])
                }},
                upsert=True)

            processed_pdf = db.metadb.find_one(
                {'upart': part['part_num'] + '__' + part['manufac']})

            if part['pdf_link'] and (not processed_pdf
                                     or processed_pdf['processed'] == 'error'):
                item = partsItem()
                item['part_num'] = part['part_num']
                item['manufac'] = part['manufac']
                yield scrapy.Request(url=part['pdf_link'],
                                     callback=self.save_pdf,
                                     meta={'item': item})
            DigikeySpider.prog_count += 1
            DigikeySpider.batch_count += 1

        #self.htmlpage_count += 1
        print_prog(self.name + ' Total progress',
                   DigikeySpider.prog_count,
                   DigikeySpider.total_count,
                   left_just=20,
                   endwith='')
        print_prog(' ' * 5 + 'Progress',
                   DigikeySpider.batch_count,
                   cfg.BATCH_SIZE,
                   left_just=5,
                   bar_length=20,
                   endwith='\r')
        next_page = response.xpath(xp.NEXT_PAGE).extract_first()

        if next_page:
            next_page = 'https://www.digikey.com' + next_page

            db.cursordb.update_one(
                {
                    'ucat': cat + '__' + subcat,
                    'dist': DigikeySpider.name
                }, {
                    '$set': {
                        'current_link': next_page
                    },
                    '$inc': {
                        'current_count': table_len
                    }
                })
            if DigikeySpider.batch_count < cfg.BATCH_SIZE:
                yield scrapy.Request(url=next_page, callback=self.parse)
        else:
            start_link = db.cursordb.find_one({
                'ucat': cat + '__' + subcat,
                'dist': DigikeySpider.name
            })['start_link']
            db.cursordb.update_one(
                {
                    'ucat': cat + '__' + subcat,
                    'dist': DigikeySpider.name
                }, {
                    '$set': {
                        'current_link': start_link,
                        'scan_complete': True
                    },
                    '$inc': {
                        'current_count': table_len
                    }
                })
Example #10
 def parse(self, response):
     if os.path.isdir('.stop_spider'):
         print(f'\nStop crawling spider {self.name}')
         raise scrapy.exceptions.CloseSpider('Stopped by user')
         
     bread_crumbs = response.xpath(xp.BREAD_CRUMBS).extract()
     cat, subcat = bread_crumbs[self.bread_crumb_idx:]
     cat, subcat = replace_illchar(cat), replace_illchar(subcat)
     table_len = len(response.xpath(xp.TABLE_LEN))
     
     table_heads = {}
     idx = xp.HEAD_START_IDX
     while True:
         head = response.xpath(xp.HEAD_TITLE % idx).extract()
         if head:
             table_heads[idx] = head[0].strip()
             idx += 1
         else:
             break
     try:
         for idx in range(self.start_row_idx, table_len+1):
             part = {
                     'part_num': replace_illchar(response.xpath(xp.PART_NUM % idx).extract_first().strip()),
                     'dist_num': response.xpath(xp.DIST_NUM % idx).extract_first().strip(),
                     'manufac': replace_illchar(response.xpath(xp.MANUFAC % idx).extract_first()),
                     'descr': response.xpath(xp.DESCR % idx).extract_first().strip(),
                     'unit_price': replace_illchar(response.xpath(xp.UNIT_PRICE % idx).extract_first().strip()),
                     'dist': self.name,
                     'dist_partlink': urljoin(self.base_url, response.xpath(xp.DIST_LINK % idx).extract_first()),
                     'pdf_link': response.xpath(xp.PDF_LINK % idx).extract_first(),
                     'min_quan': response.xpath(xp.MIN_QUAN % idx).extract_first().strip().replace(',', ''),
                     'date_scraped': datetime.datetime.utcnow()
                 }
             for title_idx, table_title in table_heads.items():
                 misc_data = response.xpath(xp.MISC_DATA % (idx, title_idx)).extract_first().strip()
                 if misc_data:
                     text = f'{table_title} {misc_data}'
                     docs = [list(self.docs.make_mongoelem(ext_elems_to_analyze=[text]))[0]]
                     
                     for sub_idx, doc in enumerate(docs):
                         col = '%s.%s' % (title_idx, sub_idx)
                         doc = {**doc, **{'direct': 'v', 'manufac': part['manufac'],
                                          'page_num': 0, 'part_num': part['part_num'],
                                          'dist_num': part['dist_num'], 'col': col,
                                          }}
                         doc['word'] += get_ngrams(doc['part_num']) \
                                        + get_ngrams(doc['manufac'])
                         db.partdb.update_one({'dist_num': part['dist_num'],
                                               'col': col},
                                               {'$setOnInsert': doc}, upsert=True)
         
             for key, conv in [('unit_price', float), ('min_quan', int)]:
                 try:
                     part[key] = conv(part[key])
                 except ValueError:
                     # Keep the raw text unless it is empty, in which case fall back
                     if not part[key]:
                         part[key] = 'See distributor'
         
             if not part['pdf_link']:
                 part['pdf_link'] = ''
     
             part['partnum_manufac_ngram3'] = get_ngrams(part['part_num']) \
                                              + get_ngrams(part['manufac'])
             
             part['root_cat'] = pt.cat_tree[cat][subcat]['myrootcat']
             part['cat'] = pt.cat_tree[cat][subcat]['mycat']
             part['subcat'] = pt.cat_tree[cat][subcat]['mysubcat']
             
             db.invendb.update_one(
                     {'root_cat': part['root_cat'],
                      'cat': part['cat'],
                      'subcat': part['subcat']},
                     {'$inc': {'total': 1}},
                     upsert=True
                 )
         
             always_update = {'unit_price': part['unit_price'],
                              'min_quan': part['min_quan'],
                              'date_scraped': part['date_scraped'],
                              'pdf_link': part['pdf_link'],
                              'dist_partlink': part['dist_partlink']}
             db.distdb.update_one(
                     {'part_num': part['part_num'],
                      'manufac': part['manufac']},
                     {'$setOnInsert': {k: v for k, v in part.items()
                                            if k not in always_update.keys()},
                      '$set': always_update},
                      upsert=True
                 )
         
             db.manufacdb.update_one(
                     {'manufac': part['manufac']},
                     {'$setOnInsert': {'manufac': part['manufac'],
                                       'manufac_ngram3': get_ngrams(
                                                         part['manufac'])}},
                      upsert=True
                 )
             
             processed_pdf = db.distdb.find_one({'part_num': part['part_num'],
                                                 'manufac': part['manufac'],
                                                 'processed': {'$exists': True}})
             
             if part['pdf_link'] and (not processed_pdf
                                      or processed_pdf['processed'] == 'error'):
                 db.distdb.update_one({'part_num': part['part_num'],
                                       'manufac': part['manufac']},
                                      {'$set': {'processed': 'pending', 'page': 0}})
 
                 if cfg.KEEP_GRIDFS_PDF:
                     item = PartsItem()
                     item['part_num'] = part['part_num']
                     item['manufac'] = part['manufac']
                     yield scrapy.Request(url=part['pdf_link'],
                                          callback=self.save_pdf,
                                          meta={'item': item})
                 else:
                     self.logger.info('### Saved PDF link %s__%s ###' % (
                             part['part_num'], part['manufac']
                         ))
                
             self.__class__.prog_count += 1
             self.__class__.batch_count += 1
             print_prog(self.name+' total progress', self.prog_count,
                        self.total_count, left_just=20, endwith='')
             print_prog(' '*5+'Progress', self.batch_count, cfg.BATCH_SIZE,
                        left_just=5, bar_length=20, endwith='\r')
         next_page = response.xpath(xp.NEXT_PAGE).extract_first()
         dup_req = False
     except AttributeError:
         next_page = response.url
         self.logger.debug(f'---> Got invalid data from {response.url}, trying again with {next_page}')
         dup_req = True
             
     if next_page:
         next_page = urljoin(self.base_url, next_page)
         if dup_req: # Don't update count, only link
             db.cursordb.update_one({'ucat': cat+'__'+subcat, 'dist': self.name},
                                    {'$set': {'current_link': next_page}})
         else:
             db.cursordb.update_one({'ucat': cat+'__'+subcat, 'dist': self.name},
                                    {'$set': {'current_link': next_page},
                                     '$inc': {'current_count': table_len}})
                                          
         if self.batch_count < cfg.BATCH_SIZE:
             yield scrapy.Request(url=next_page, callback=self.parse,
                                  dont_filter=dup_req)
     else:
         start_link = db.cursordb.find_one(
                 {'ucat': cat+'__'+subcat,
                  'dist': self.name}
             )['start_link']
         db.cursordb.update_one({'ucat': cat+'__'+subcat, 'dist': self.name},
                                {'$set': {'current_link': start_link,
                                          'scan_complete': True},
                                 '$inc': {'current_count': table_len}})