Exemplo n.º 1
0
 def process_item(self, item, spider):
     """Exclude awards that were given to organisations rather than people.

     Returns the item unchanged when its ``gender`` field is truthy and
     raises DropItem (naming the entry) otherwise.
     """
     if item['gender']:
         return item
     raise DropItem("No gender for %s" % item['name'])
Exemplo n.º 2
0
 def item_completed(self, results, item, info):
     """Store the paths of all successful downloads on the item.

     ``results`` is consumed as ``(ok, payload)`` pairs; only payloads whose
     ``ok`` flag is truthy contribute their ``'path'`` entry. The item is
     dropped when no path was collected.
     """
     image_paths = []
     for ok, payload in results:
         if ok:
             image_paths.append(payload['path'])
     if image_paths:
         item['image_paths'] = image_paths
         return item
     raise DropItem("Item contains no images")
Exemplo n.º 3
0
 def process_item(self, item, spider):
     """Pass each ``Id`` through once; drop any item whose ``Id`` was seen.

     New ids are remembered in ``self.ids_seen``.
     """
     item_id = item['Id']
     if item_id not in self.ids_seen:
         self.ids_seen.add(item_id)
         return item
     raise DropItem("Duplicate item found: %s" % item)
Exemplo n.º 4
0
 def process_item(self, item, spider):
     """Translate a scraped item into a ``School`` record, keyed by spider name.

     Every supported spider delivers its own field names, so each branch maps
     that spider's keys onto the ``School`` keyword arguments.

     Args:
         item: mapping of scraped fields (accessed with ``get`` / indexing).
         spider: object whose ``name`` selects the mapping branch.

     Returns:
         ``{'info': school, 'item': item}`` for a recognised spider name, or
         the untouched ``item`` when the name matches none of the branches.
     """
     if spider.name == 'saarland':
         address = u"{} {}".format(item.get('street', ""),
                                   item.get('zip', ""))
         if item.get('email'):
             # Links arrive as URL-encoded "mailto:" hrefs; turn them back
             # into a plain address.
             email = item['email'].replace('mailto:',
                                           '').replace('%40', '@')
         else:
             email = None
         # NOTE(review): director is filled from the 'telephone' field, which
         # looks like a copy-paste slip — confirm the intended source key.
         school = School(name=item.get('name'),
                         phone=item.get('telephone'),
                         director=item.get('telephone'),
                         website=item.get('website'),
                         fax=item.get('fax'),
                         email=email,
                         address=address)
     elif spider.name == 'niedersachsen':
         address = u"{} {}".format(item.get('Straße', ""),
                                   item.get('Ort', ""))
         school = School(name=item.get('Schule'),
                         phone=item.get('Tel'),
                         email=item.get('E-Mail'),
                         website=item.get('Homepage'),
                         address=address,
                         id='NDS-{}'.format(item.get('Schulnummer')))
     elif spider.name == 'bayern':
         school = School(name=item.get('Name'),
                         phone=item.get('Telefon'),
                         website=item.get('website'),
                         address=item.get('Anschrift'),
                         id='BAY-{}'.format(item.get('Schulnummer')))
     elif spider.name == 'thueringen':
         school = School(name=item.get('Schulname'),
                         id='TH-{}'.format(item.get('Schulnummer')),
                         address=u"{} {}".format(item.get('Straße'),
                                                 item.get('Ort')),
                         website=item.get('Internet'),
                         email=item.get('E-Mail'),
                         school_type=item.get('Schulart'),
                         provider=item.get('Schulträger'),
                         fax=item.get('Telefax'),
                         phone=item.get('Telefon'))
     elif spider.name == 'schleswig-holstein':
         school = School(name=item.get('Name'),
                         id='SH-{}'.format(item.get('Dienststellen Nr.')),
                         address=u"{} {} {}".format(item.get('Straße'),
                                                    item.get("PLZ"),
                                                    item.get("Ort")),
                         email=item.get('EMail'),
                         school_type=item.get('Organisationsform'),
                         legal_status=item.get('Rechtsstatus'),
                         provider=item.get('Träger'),
                         fax=item.get('Fax'),
                         phone=item.get('Telefon'),
                         director=item.get('Schulleiter(-in)'))
     elif spider.name == 'bremen':
         # The contact string carries "Schulleitung:" and "Vertretung:"
         # labels; replacing the second label with a comma lets one split
         # separate head and deputy.
         ansprechpersonen = item['Ansprechperson'].replace(
             'Schulleitung:', '').replace('Vertretung:', ',').split(',')
         item['Schulleitung'] = ansprechpersonen[0]
         # An entry without a deputy splits into a single part; store None
         # instead of failing with IndexError.
         item['Vertretung'] = (ansprechpersonen[1]
                               if len(ansprechpersonen) > 1 else None)
         school = School(name=item.get('name'),
                         address=item.get('Anschrift:'),
                         website=item.get('Internet'),
                         email=item.get('E-Mail-Adresse'),
                         fax=item.get('Telefax'),
                         phone=item.get('Telefon'))
     elif spider.name == 'sachsen':
         school = School(name=item.get('title'),
                         id='SN-{}'.format(
                             item.get('Dienststellenschlüssel')),
                         address=item.get('Postanschrift'),
                         website=item.get('Homepage'),
                         email=item.get('E-Mail'),
                         school_type=item.get('Einrichtungsart'),
                         legal_status=item.get('Rechtsstellung'),
                         provider=item.get('Schulträger'),
                         fax=item.get('Telefax'),
                         phone=item.get('phone_numbers'),
                         director=item.get('Schulleiter'))
     elif spider.name == 'sachsen-anhalt':
         school = School(
             name=item.get('Name'),
             address=item.get('Addresse'),
             website=item.get('Homepage'),
             email=item.get('E-Mail'),
             fax=item.get('Fax'),
             phone=item.get('Telefon'),
         )
     elif spider.name == 'brandenburg':
         school = School(name=item.get('name'),
                         id=item.get('nummer'),
                         address=item.get('Adresse'),
                         website=item.get('Internet'),
                         email=item.get('E-Mail'),
                         school_type=item.get('Schulform'),
                         provider=item.get('Schulamt'),
                         fax=item.get('Fax'),
                         phone=item.get('Telefon'),
                         director=item.get('Schulleiter/in'))
     else:
         # Unknown spider: hand the item on unchanged. (A DropItem raise used
         # to follow this return but could never execute.)
         return item
     return {'info': school, 'item': item}
 def process_item(self, item, spider):
     """Keep only items whose ``titulo`` contains the text 'capsula'."""
     if 'capsula' in item['titulo']:
         return item
     raise DropItem('No tiene capsula')
Exemplo n.º 6
0
 def process_item(self, item, spider):
     """Drop items whose ``caseId`` was already recorded in ``self.ids_seen``.

     A first-time ``caseId`` is added to the set and the item is returned.
     """
     case_id = item['caseId']
     if case_id in self.ids_seen:
         raise DropItem("Contains caseId: %s" % item)
     self.ids_seen.add(case_id)
     return item
Exemplo n.º 7
0
 def item_completed(self, results, item, info):
     """Attach the paths of successfully downloaded images as ``brand_avatar``.

     The ``ok`` flag of each ``(ok, payload)`` pair tells whether that
     download succeeded; the item is dropped if none did.
     """
     succeeded = (payload for ok, payload in results if ok)
     brand_avatar = [payload['path'] for payload in succeeded]
     if brand_avatar:
         item['brand_avatar'] = brand_avatar
         return item
     raise DropItem("Item contains no images")
Exemplo n.º 8
0
 def process_item(self, item, spider):
     """Drop items for which ``self.has(item)`` is truthy, logging the id."""
     if not self.has(item):
         return item
     self.logger.info('skipping id {}'.format(item['id']))
     raise DropItem()
Exemplo n.º 9
0
 def process_item(self, item, spider):
     """Strip '%' characters from both ends of ``rate``; drop items without one."""
     rate = item['rate']
     if not rate:
         raise DropItem("Missing rate in %s" % item)
     item['rate'] = rate.strip('%')
     return item
Exemplo n.º 10
0
                    tx.execute(sql2)
                    log.msg("update item : %s" % item, level=log.WARNING)

                except MySQLdb.OperationalError, e:
                    loger.error(e)
                    loger.error(u'UPDATE failed,OperationalError:%s' % item)
                except MySQLdb.ProgrammingError, e:
                    loger.error(e)
                    loger.error(u'UPDATE failed,ProgrammingError:%s' % item)
                except MySQLdb.DatabaseError, e:
                    loger.error(e)
                    loger.error(u'UPDATE failed,DatabaseError:%s' % item)

        except KeyError:
            logger.error('error item:%s' % item)
            DropItem(u'missing thread_id:%s' % item)

    def _conditional_insert2(self, tx, item):
        # create record if doesn't exist.
        # all this block run on it's own thread
        sql1 = 'set names utf8mb4'
        tx.execute(sql1)
        # 判断item是都否缺失项目,如果缺失就放弃该项
        try:

            args = (item['user_id'], item['user_name'])
            sql = "insert into user(user_id,user_name) VALUES(%s,'%s')" % args
            try:
                #执行插入操作
                tx.execute(sql)
                log.msg("Item stored in db: %s" % item, level=log.INFO)
Exemplo n.º 11
0
 def process_item(self, item, spider):
     """Drop items whose ``title`` is already present in ``self.links_seen``.

     Unseen titles are appended to ``self.links_seen`` and the item passes.
     """
     title = item['title']
     if title not in self.links_seen:
         self.links_seen.append(title)
         return item
     raise DropItem("Duplicate item found: %s" % item)
 def process_item(self, item, spider):
     """Drop the item if pandas reports any of its values as null."""
     null_flags = pd.isnull(list(item.values()))
     if not any(null_flags):
         return item
     raise DropItem("Missing values!")
Exemplo n.º 13
0
 def process_item(self, item, spider):
     """Drop items dated before ``spider.start_date``, when the spider has one.

     Spiders without a ``start_date`` attribute let every item through.
     """
     too_old = (hasattr(spider, 'start_date')
                and spider.start_date > item.get('date'))
     if too_old:
         raise DropItem(
             'Droping all items before {}'.format(spider.start_date))
     return item
Exemplo n.º 14
0
    def process_item(self, item, spider):
        """Drop self-posts: items whose URL starts with ``item?id=<digits>``.

        ``re.match`` anchors at the start of the string, so only URLs that
        begin with that relative form are excluded; all other items are
        returned unchanged.

        Raises:
            DropItem: when ``item["url"]`` matches the self-post pattern.
        """
        # Raw string: "\?" in a plain literal is an invalid escape sequence.
        if re.match(r"item\?id=[0-9]+", item["url"]):
            raise DropItem("Excluded self-post: " + item["url"])

        return item
Exemplo n.º 15
0
    def process_item(self, item, spider):
        """Clean the scraped product fields in place, drop repeated URLs, upsert to MongoDB.

        The item's keys are visited one by one and each known field is
        normalised as described in the comments below. After the loop the
        item is written to ``self.db[self.mongodb_collection]`` with
        ``_id`` set to the item's ``itemUrl``.

        Raises:
            DropItem: when the normalised ``itemUrl`` is already present in
                ``self.parsedUrls``.

        Returns:
            The same ``item`` object, with its fields normalised.
        """
        # print("item::::::::::::::::::::::::", item)

        # Substring found in the URL / page name -> tag list stored as itemType.
        item_type_tags = {
            'dress': ['dress', 'one-piece'],
            'skirts': ['skirt', 'bottoms'],
            'shorts': ['shorts', 'bottoms'],
            'tops': ['top'],
            'pants': ['pants', 'bottoms'],
            'romper': ['romper', 'one-piece'],
            'one-piece': ['one-piece'],
            'onepiece': ['one-piece'],
            'blouse': ['top'],
            'culottes': ['pants', 'bottoms'],
        }

        for key in item:
            if key == 'itemName':
                # Parse the name as HTML and keep only its text content:
                # some names contain tags such as <br>.
                tree = lxml.html.fromstring(
                    item[key]
                )
                item[key] = tree.text_content().strip()

            if key == 'itemPrice':
                # A missing price becomes "SGD0"; the "SGD" prefix is then
                # removed so only the amount text remains.
                if item[key] is None:
                    print("no promo price, set as 0 dollar")
                    item[key] = "SGD0"
                if 'SGD' in item[key]:
                    item[key] = item[key].replace('SGD', '').strip()

            if key == 'itemType':
                # The first category name found in the item URL or the page
                # name decides the tag list.
                # NOTE(review): item['itemUrl'] is read here, but its
                # None -> "" normalisation happens in the 'itemUrl' branch
                # below — confirm this is safe for the key order in use.
                for cat_name in item_type_tags:
                    if cat_name in item['itemUrl']:
                        item[key] = item_type_tags[cat_name]
                        break
                    if cat_name in item['pageName']:
                        item[key] = item_type_tags[cat_name]
                        break
                    # Fallback for an empty type; this runs inside the loop,
                    # so a later category match still overwrites it.
                    if item[key] == "":
                        item[key] = ['others']

            if key == 'itemImageUrl':
                if item[key] is None:
                    item[key] = ""

            if key == 'itemUrl':
                if item[key] is None:
                    item[key] = ""

                # Remove duplicate item URLs: TTR has the same URLs with and
                # without a leading "/" under the /product page, so force the
                # leading slash before comparing.
                if item[key][:1] != '/':
                    item[key] = "/" + item[key]

                # self.parsedUrls counts how often each normalised URL was seen.
                if item[key] not in self.parsedUrls:
                    self.parsedUrls[item[key]] = 1
                    # print("write!")
                else:
                    self.parsedUrls[item[key]] += 1
                    print("dupe found!!!!!" + item[key])
                    # NOTE(review): the message says "Missing" although this
                    # branch handles a duplicate URL.
                    raise DropItem("Missing {0}!".format(item))

            if key == 'dateCrawled':
                # Overwrite with today's date as a string.
                item[key] = str(date.today())

        # if item already exist, dont write to db again -- 11/04/2020 (crawlCount is now useless)
        # itemAlreadyExist = self.db[self.mongodb_collection].find_one({"_id": item['itemUrl']})

        # The existence lookup above is commented out, so this is always None
        # and the "Already exists" branch below never runs.
        itemAlreadyExist = None
        print("writing to dbbbbb")
        if (itemAlreadyExist == None):
            # set itemUrl as primary key, update all other fields, increment crawlCount to track which are newly added -- 18/08/2019
            # self.db[self.mongodb_collection].update({'_id': item['itemUrl']}, {"$inc": {'crawlCount': 1}, "$set": dict(item)}, upsert=True)

            # use crawlcount as page number as ranking instead -- 11/04/2020
            # Upsert keyed on the item URL: every field of the item is $set.
            self.db[self.mongodb_collection].update({'_id': item['itemUrl']},
                                                    {"$set": dict(item)},
                                                    upsert=True)
            # self.db[self.mongodb_collection].insert(dict(item))
            logging.info("Adding into MongoDB!")
        else:
            logging.info("Already exists in db")
        return item
Exemplo n.º 16
0
 def process_item(self, item, spider):
     """Strip '元' characters from both ends of ``amount``; drop items without one."""
     amount = item['amount']
     if amount:
         item['amount'] = amount.strip('元')
         return item
     raise DropItem("Missing rate in %s" % item)
Exemplo n.º 17
0
 def process_item(self, item, spider):
     """Insert the item via ``self.db.insert``; drop it if the result is falsy."""
     if self.db.insert(item):
         return item
     raise DropItem("CSV: %s is duplicated" % item['name'])
Exemplo n.º 18
0
 def process_item(self, item, spider):
     """Append '--panda' to ``name`` and drop the item when the result is non-empty.

     The suffixed name is written back to the item before the check, so the
     mutation is visible even when the item is dropped.
     """
     suffixed = item['name'] + '--panda'
     item['name'] = suffixed
     if not suffixed:
         return item
     raise DropItem("哈哈,这是一条信息,当name不为空的时候我就报错DropItem这一条item就会被抛弃")
Exemplo n.º 19
0
 def item_completed(self, results, item, info):
     """Store the first successfully downloaded image path as ``saveURL``.

     The item is dropped when no download in ``results`` succeeded.
     """
     image_path_list = []
     for ok, payload in results:
         if ok:
             image_path_list.append(payload['path'])
     if image_path_list:
         item['saveURL'] = image_path_list[0]
         return item
     raise DropItem("Item contains no images")
Exemplo n.º 20
0
 def item_completed(self, results, item, info):
     """Drop the item when the first download result is flagged unsuccessful."""
     first_ok = results[0][0]
     if first_ok:
         return item
     raise DropItem('下载失败')
Exemplo n.º 21
0
 def process_item(self, item, spider):
     """Drop items whose first ``Review`` entry is already in ``self.reviews_seen``.

     A first-time review text is added to the set and the item passes.
     """
     review = item['Review'][0]
     if review not in self.reviews_seen:
         self.reviews_seen.add(review)
         return item
     raise DropItem("Repeated items found: %s" % item)
Exemplo n.º 22
0
 def process_item(self, item, spider):
     """Drop items whose ``vaga`` field is None; pass everything else."""
     if item['vaga'] is not None:
         return item
     raise DropItem("item found: %s" % item)
Exemplo n.º 23
0
 def process_item(self, item, spider):
     """Drop items that have no (or an empty) ``company_name``."""
     if not item.get("company_name"):
         raise DropItem("company_name is null")
     return item
Exemplo n.º 24
0
 def item_completed(self, results, item, info):
     """Drop the item unless at least one download in ``results`` succeeded."""
     image_path = []
     for ok, payload in results:
         if ok:
             image_path.append(payload['path'])
     if image_path:
         return item
     raise DropItem("图片下载失败")
Exemplo n.º 25
0
 def process_item(self, item, spider):
     """Upsert a ``GraphImage`` item, export new ones to disk and queue a fetch task.

     Items that are not ``GraphImage`` instances are returned untouched.
     For a ``GraphImage`` the document is upserted by ``_id``; the newest and
     oldest ``date`` values seen are tracked on the pipeline. When the upsert
     inserted a new document, the item is also dumped to
     ``<export_filepath>/<_id>.jl`` and a ``fetch_image`` task is sent.

     Raises:
         DropItem: when sending the task fails with ``RedisError`` or when
             any other error occurs while saving the item.

     Returns:
         The item, on success.
     """
     if not isinstance(item, GraphImage):
         return item
     try:
         ret = self.coll.update({"_id": item["_id"]}, {
             # Written only when the document is first created.
             "$setOnInsert": {
                 "_id": item["_id"],
                 "instagram_id": item["instagram_id"],
                 "owner_id": item["owner_id"],
                 "thumbnail_src": item["thumbnail_src"],
                 "thumbnail_resources": item["thumbnail_resources"],
                 "typename": item.get("typename"),
                 "is_video": item["is_video"],
                 "date": item["date"],
                 "display_src": item["display_src"],
                 "caption": item["caption"],
                 "download_urls": item["download_urls"],
                 "downloaded_img_info": item.get("downloaded_img_info"),
                 "status": 1,
                 "scraped_ts": int(time.time()),
             },
             "$set": {
                 "update_ts": int(time.time())
             },
             "$addToSet": {
                 "hashtags": {
                     "$each": item.get('hashtags', [])
                 }
             }
         },
                                upsert=True)
         # Track the date range covered by the processed items.
         if item["date"] > self.latest_downloaded_ts:
             self.latest_downloaded_ts = item["date"]
         if item["date"] < self.earliest_downloaded_ts:
             self.earliest_downloaded_ts = item["date"]
         if ret['updatedExisting']:
             logger.info('Updated graph images: %s', item["_id"])
             self.existed += 1
         else:
             filename = '{}.jl'.format(item["_id"])
             filename = os.path.join(self.export_filepath, filename)
             # The context manager closes the export file even when the
             # exporter raises; it used to be left open.
             with open(filename, 'wb') as export_file:
                 exportor = JsonLinesItemExporter(export_file)
                 exportor.start_exporting()
                 exportor.export_item(item)
                 exportor.finish_exporting()
             logger.info('dumped item to file: %s', ret['upserted'])
             logger.info('Inserted graph images: %s', ret['upserted'])
             self.task.send_task('fetch_image', (item['_id'], ))
             logger.info('Send task fetch_image: %s', item['_id'])
             self.inserted += 1
     except RedisError:
         logger.error('Send task Failed. Network unreachable')
         raise DropItem('Send fetch_image task FAILED. DROP ITEM %s' %
                        item["_id"])
     except Exception:
         # Exception (not a bare except) so that KeyboardInterrupt and
         # SystemExit are not converted into a dropped item.
         logger.error('DB FAILED: %s', traceback.format_exc())
         raise DropItem('Save graph image to db FAILED. DROP ITEM %s' %
                        item["_id"])
     else:
         return item
Exemplo n.º 26
0
 def item_completed(self, results, item, info):
     """Pass the item only if at least one image download succeeded."""
     succeeded = (payload for ok, payload in results if ok)
     image_paths = [payload['path'] for payload in succeeded]
     if image_paths:
         return item
     raise DropItem('Image Downloaded Failed')
Exemplo n.º 27
0
 def process_item(self, item, spider):
     """Drop the item unless every one of its values is truthy."""
     if all(item.values()):
         return item
     raise DropItem("Missing values!")
Exemplo n.º 28
0
 def item_completed(self, results, item, info):
     """Record the paths of successful downloads as ``front_image_path``.

     The item is dropped when none of the ``results`` succeeded.
     """
     front_image_path = []
     for ok, payload in results:
         if ok:
             front_image_path.append(payload['path'])
     if front_image_path:
         item['front_image_path'] = front_image_path
         return item
     raise DropItem("Item contains no images")
Exemplo n.º 29
0
 def item_completed(self, results, item, info):
     """Drop the item when no image in ``results`` was downloaded successfully."""
     image_path = []
     for ok, payload in results:
         if ok:
             image_path.append(payload['path'])
     if image_path:
         return item
     raise DropItem('Item contains no images')
Exemplo n.º 30
0
    def process_item(self, item, spider):
        """Export items with at least 1000 likes; drop the rest.

        ``item['like']`` is converted with ``int`` before the comparison.
        """
        if int(item['like']) >= 1000:
            self.exporter.export_item(item)
            return item
        raise DropItem("Missing price in %s" % item)