Exemplo n.º 1
0
    def process_item(self, item, spider):
        """Add the PersonItem component and the list of ToolItem components to
        the database. Return the input item.
        If a database constraint is violated, rollback the transaction.

        A ``Person`` is built from ``item["person"]``, one ``Tool`` is built
        from each entry of ``item["tools"]`` and appended to ``person.tools``,
        and the person is then added to ``self.session`` and committed.

        Arguments:
            - item: dictionary {'person': PersonItem component,
                                'tools': list of ToolItem components}
            - spider: a spider instance (see scrapy docs); not used here

        Returns:
            - item: the very same dictionary that was passed in, unchanged

        Note: the "rollback" behavior is currently untestable by SQLAlchemy.
        """
        person = Person(**item["person"])
        person.tools.extend(Tool(**tool_item) for tool_item in item["tools"])

        self.session.add(person)
        try:
            self.session.commit()
        except IntegrityError:
            # Only constraint violations are swallowed; any other database
            # error still propagates to the caller.
            # `Logger.warn` is a deprecated alias; `warning` is the supported
            # spelling.
            logger.warning('"%s" is already in database.', person.name)
            # Roll back so the session is usable for the next item.
            self.session.rollback()
        else:
            # Progress marker: one dot per person successfully committed.
            sys.stderr.write(".")

        return item
Exemplo n.º 2
0
def validate_tool_items(items, person_item, verbose=False):
    """Filter ``items`` in place so that only valid tool items remain.

    Validity is decided by ``is_valid_tool``; entries for which it returns a
    falsy value are removed. A warning is logged when the person has no tools
    at all, or when none of their tools survive the filtering.

    Arguments:
        - items: mutable list of tool items; modified in place, so the
                 caller's list object reflects the filtering
        - person_item: mapping that provides the "name" and "article_url"
                       keys (both are read before anything else happens)
        - verbose: passed through unchanged to ``is_valid_tool``

    Returns:
        - None
    """
    name = person_item["name"]
    article_url = person_item["article_url"]

    if not items:
        # `Logger.warn` is a deprecated alias; `warning` is the supported
        # spelling.
        logger.warning("%s (%s) doesn't use any tools.", name, article_url)
        return

    # Slice assignment replaces the contents of the caller's list rather than
    # rebinding the local name, so the caller sees only the valid items.
    items[:] = [
        item
        for item in items
        if is_valid_tool(item, name, article_url, verbose)
    ]

    if not items:
        logger.warning("%s doesn't use any tools that have valid URLs.", name)
Exemplo n.º 3
0
def validate_person_item(item, verbose=False):
    """Validate a person item, raising on hard errors and warning on soft ones.

    The checks run in this order and stop at the first failure:
    missing fields (as reported by ``missing_item_fields``), empty name,
    invalid article URL, invalid publication date, invalid image source.
    After that, empty "bio", "hardware", "software" and "dream" entries are
    only reported as warnings.

    Arguments:
        - item: mapping with at least the "name", "article_url", "pub_date",
                "img_src", "bio", "hardware", "software" and "dream" keys
        - verbose: accepted for signature compatibility; not used here

    Returns:
        - None

    Raises:
        - ItemValidationError: if any of the hard checks above fails
    """
    missing_fields = missing_item_fields(item)
    if missing_fields:
        raise ItemValidationError(
            "PersonItem missing fields: {missing_fields}".format(
                missing_fields=missing_fields
            )
        )

    name = item["name"]
    article_url = item["article_url"]
    if not name:
        raise ItemValidationError(
            "Interview at {article_url} doesn't have a person's name".format(
                article_url=article_url
            )
        )

    if not is_valid_url(article_url):
        raise ItemValidationError(
            "{name} ({article_url}) doesn't have a valid interview URL".format(
                name=name, article_url=article_url
            )
        )

    pub_date = item["pub_date"]
    if not is_valid_date(pub_date):
        raise ItemValidationError(
            "{name} ({article_url}) doesn't have a publication date.".format(
                name=name, article_url=article_url
            )
        )

    img_src = item["img_src"]
    if not is_valid_src(img_src):
        raise ItemValidationError(
            "{name} ({article_url}) doesn't have a valid image source URL "
            "({img_src}).".format(
                name=name, article_url=article_url, img_src=img_src
            )
        )

    # Soft checks: an empty section is worth a warning but does not make the
    # item invalid. Each template is kept verbatim so log output is unchanged.
    soft_checks = (
        ("bio", "%s (%s) doesn't have a bio."),
        ("hardware", "%s (%s) doesn't have a hardware section."),
        ("software", "%s (%s) doesn't have a software section."),
        ("dream", "%s (%s) doesn't have a dream-setup section."),
    )
    for field, template in soft_checks:
        if not item[field]:
            # `Logger.warn` is a deprecated alias; `warning` is the supported
            # spelling.
            logger.warning(template, name, article_url)