def __init__(self, url, test=False):
    self.testing_mode = test
    self.data = {}
    self.format = None
    if test:
        # when testing, we load a file
        with url:
            r = url.read()
        data = extruct.extract(
            r,
            base_url="https://www.allrecipes.com/recipe/133948/four-cheese-margherita-pizza/",
            syntaxes=SYNTAXES,
            uniform=True,
        )
    else:
        r = requests.get(url, headers=HEADERS)
        data = extruct.extract(
            r.text,
            base_url=get_base_url(r.text, r.url),
            syntaxes=SYNTAXES,
            uniform=True,
        )
    for syntax in SYNTAXES:
        for item in data.get(syntax, []):
            if (
                "@context" in item
                and item["@context"] == SCHEMA_ORG_HOST
                and "@type" in item
                and item["@type"].lower() == SCHEMA_NAME.lower()
            ):
                self.format = syntax
                self.data = item
                return
def get_article_content(url):
    # Extract once; the original fetched and re-parsed the page for every
    # check, which is redundant and slow.
    metadata = extruct.extract(get_html(url))
    if metadata.get('microdata'):
        return parse_microdata(metadata['microdata'])
    elif metadata.get('json-ld'):
        return parse_json_ld(metadata['json-ld'])
def load(fp: Union[str, IO[str]],
         python_objects: bool = False,
         nonstandard_attrs: bool = False,
         migrate_old_schema: bool = True) -> List[Dict]:
    """Load a filename or file object to scrape.

    Parameters
    ----------
    fp : string or file-like object
        A file name or a file-like object.
    python_objects : bool, optional
        When True, translates some data types into Python objects:
        dates into datetime.date, datetimes into datetime.datetime,
        durations into datetime.timedelta. (defaults to False)
    nonstandard_attrs : bool, optional
        When True, adds nonstandard (for schema.org/Recipe) attributes to
        the resulting dictionaries that are outside the specification,
        such as:
            '_format' is either 'json-ld' or 'microdata' (how
                schema.org/Recipe was encoded into HTML)
            '_source_url' is the source url, when 'url' has already been
                defined as another value
        (defaults to False)
    migrate_old_schema : bool, optional
        When True, migrates the schema from an older version to the
        current version. (defaults to True)

    Returns
    -------
    list
        A list of dictionaries in the style of schema.org/Recipe JSON-LD;
        an empty list is returned when there are no results.
    """
    data = {}  # type: Dict[str, List[Dict]]
    if isinstance(fp, str):
        with open(fp) as f:
            data = extruct.extract(f.read())
    elif hasattr(fp, 'read'):
        # Assume this is some kind of file-like object that can be read.
        data = extruct.extract(fp.read())
    else:
        raise TypeError('expected fp to be a filename or a file-like object, '
                        'fp is of type {}'.format(type(fp)))

    scrapings = _convert_to_scrapings(data, nonstandard_attrs)
    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)
    if python_objects is True:
        scrapings = _pythonize_objects(scrapings)
    return scrapings
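# A minimal usage sketch for load() above; 'recipe_page.html' is a
# hypothetical local file containing schema.org/Recipe markup, and the
# module-level helpers (_convert_to_scrapings and friends) are assumed
# to be defined alongside load().
recipes = load('recipe_page.html', python_objects=True)
for recipe in recipes:
    # with python_objects=True, durations arrive as datetime.timedelta
    print(recipe.get('name'), recipe.get('totalTime'))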
def test_errors(self):
    body = ''

    # raise exceptions
    with pytest.raises(Exception):
        data = extruct.extract(body)

    # ignore exceptions
    expected = {}
    data = extruct.extract(body, errors='ignore')
    assert data == expected

    # log exceptions
    data = extruct.extract(body, errors='log')
    assert data == expected
def get_json_ld_description(url):
    description = None
    error_code = None
    ## TODO: Get detail on the errors
    try:
        r = requests.get(url, timeout=5)
    except requests.RequestException:
        error_code = '404'
        return description, error_code
    try:
        base_url = get_base_url(r.text, r.url)
    except Exception:
        error_code = '500'
        return description, error_code
    try:
        data = extruct.extract(r.text, base_url=base_url, syntaxes=['json-ld'])
    except Exception:
        error_code = '500'
        return description, error_code
    for item in data['json-ld']:
        if item.get('description'):
            description = item['description']
    return description, '200'
def resolve_jsonld(html):
    from extruct import extract

    ld = extract(html,
                 syntaxes=['json-ld', 'microdata', 'opengraph', 'rdfa'],
                 uniform=True)
    # TODO: deal with other types that aren't json-ld by converting them.
    logging.info('extracted metadata: %s', ld)
    return ld
def get_metadata(html: bytes, url: str):
    """Fetch JSON-LD structured data."""
    # get_base_url() needs the page body as well as the URL; passing the
    # URL alone (as the original did) yields an empty base URL.
    metadata = extruct.extract(html,
                               base_url=get_base_url(html, url),
                               syntaxes=['json-ld'],
                               uniform=True)['json-ld']
    return metadata
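# A hedged usage sketch for get_metadata(); the URL is illustrative, and
# fetching with requests is an assumption, not part of the snippet above.
import requests

response = requests.get("https://example.org/article")
for item in get_metadata(response.content, response.url):
    print(item.get("@type"), item.get("headline"))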
def extract(html, url):
    data = extruct.extract(
        html,
        url,
        syntaxes=['microdata', 'json-ld', 'rdfa', 'microformat'],
        errors='log')
    schemas = []
    if data.get('json-ld'):
        for schema in data['json-ld']:
            if schema.get('@type'):
                schema_type = schema_pattern_match(schema['@type'])
                if schema_type:
                    schema['@type'] = schema_type
                    schemas.append(schema)
    if data.get('microdata'):
        for schema in data['microdata']:
            schema_type = schema_pattern_match(schema['type'])
            if schema_type:
                schema['type'] = schema_type
                schemas.append(schema)
    if data.get('microformat'):
        for schema in data['microformat']:
            schema_type = schema_pattern_match(schema['type'])
            if schema_type:
                schema['type'] = schema_type
                schemas.append(schema)
    return schemas
def get_jsons(url):
    r = requests.get(url)
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)
    if 'json-ld' not in data:
        raise Exception('No json-ld data found')
    return data['json-ld']
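# A small usage sketch for get_jsons(); the URL is illustrative only.
try:
    items = get_jsons("https://example.org/product-page")
except Exception as exc:
    print("extraction failed:", exc)
else:
    for item in items:
        print(item.get("@type"))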
def test_deprecated_url(self):
    body, expected = self._microdata_custom_url('product_custom_url.json')
    with pytest.warns(DeprecationWarning):
        data = extruct.extract(body,
                               url='http://some-example.com',
                               syntaxes=['microdata'])
    self.assertEqual(data, expected)
def get_recipe_data(url):
    def _find_recipe(c):
        # Walk the extracted structure recursively, returning the first
        # dict whose @type is "Recipe".
        if isinstance(c, dict):
            if c.get("@type") == "Recipe":
                return c
            for i in c.values():
                res = _find_recipe(i)
                if res:
                    return res
        if isinstance(c, list):
            for i in c:
                res = _find_recipe(i)
                if res:
                    return res
        return []

    html = requests.get(url, headers=HEADERS, cookies=COOKIES)
    data_list = extract(html.text, uniform=True)
    recipe_data = _find_recipe(data_list)
    if not recipe_data:
        raise MissingSchema(
            "Website does not provide a schema.org Recipe schema in a "
            "json-ld format"
        )
    return recipe_data
def test_umicrodata(self):
    expected = [{
        "@context": "http://schema.org",
        "@type": "Product",
        "brand": "ACME",
        "name": "Executive Anvil",
        "image": "anvil_executive.jpg",
        "description": "Sleeker than ACME's Classic Anvil, the Executive Anvil is perfect for the business traveler looking for something to drop from a height.",
        "mpn": "925872",
        "aggregateRating": {
            "@type": "AggregateRating",
            "ratingValue": "4.4",
            "reviewCount": "89"
        },
        "offers": {
            "@type": "Offer",
            "priceCurrency": "USD",
            "price": "119.99",
            "priceValidUntil": "2020-11-05",
            "seller": {
                "@type": "Organization",
                "name": "Executive Objects"
            },
            "itemCondition": "http://schema.org/UsedCondition",
            "availability": "http://schema.org/InStock"
        }
    }]
    body = get_testdata('misc', 'product_microdata.html')
    data = extruct.extract(body, syntaxes=['microdata'], uniform=True)
    self.assertEqual(data['microdata'], expected)
def process_spider_output(self, response, result, spider):
    """get all metadata and add them as fields to item"""
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(levelname)s - %(message)s')
    for scraped_item in result:
        if isinstance(scraped_item, Request):
            # yield the request without making changes
            yield scraped_item
        else:
            # if this is an item, inspect it for microdata
            data = extruct.extract(
                response.body,
                response.url,
                syntaxes=['microdata', 'opengraph', 'rdfa', 'json-ld'],
                uniform=True)
            logging.debug(response.url)
            scraped_item['url'] = response.url
            # scraped_item['source_spider'] = spider.name
            if not data:
                # if no microdata was found, set the flag and yield the item
                logging.debug('this item has no microdata')
                scraped_item['hasMetaData'] = False
            else:
                scraped_item['hasMetaData'] = True
                scraped_item['microdatas'] = data
            yield scraped_item
def test_uopengraph(self):
    expected = [{
        "@context": {
            "og": "http://ogp.me/ns#",
            "fb": "http://www.facebook.com/2008/fbml",
            "concerts": "http://ogp.me/ns/fb/songkick-concerts#"
        },
        "fb:app_id": "308540029359",
        "og:site_name": "Songkick",
        "@type": "songkick-concerts:artist",
        "og:title": "Elysian Fields",
        "og:description": "Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.",
        "og:url": "http://www.songkick.com/artists/236156-elysian-fields",
        "og:image": "http://images.sk-static.com/images/media/img/col4/20100330-103600-169450.jpg",
    }]
    body = get_testdata('songkick', 'elysianfields.html')
    data = extruct.extract(body, syntaxes=['opengraph'], uniform=True)
    self.assertEqual(data['opengraph'], expected)
def process_spider_output(self, response, result, spider):
    """get all metadata and add them as fields to item"""
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(levelname)s - %(message)s')
    for x in result:
        if isinstance(x, Request):
            # yield the request without making changes
            yield x
        else:
            # if this is an item, inspect it for microdata
            data = extruct.extract(response.body, response.url)
            logging.debug(response.url)
            x['url'] = response.url
            if not data:
                # if no microdata was found, set the flag and yield the item
                logging.debug('this item has no microdata')
                x['hasMetaData'] = False
            else:
                # if there is microdata, parse it into the item
                x['hasMetaData'] = True
                logging.debug('data is an object of type')
                logging.debug(type(data))
                logging.debug('parsing microdata fields')
                for microdatatype, microdatafields in data.items():
                    logging.debug(microdatatype)
                    logging.debug(microdatafields)
                    x[microdatatype] = microdatafields
            yield x
def parse(self, response):
    if len(self.claimdf) == 100:
        raise CloseSpider('100 crawled')
    data = extruct.extract(response.text, response.url)['microdata']
    selected = [
        properties for properties in data
        if properties['type'] == 'http://schema.org/ClaimReview'
    ]
    for elements in selected:
        dictt = elements['properties']
        # flatten single-element lists so json_normalize produces scalars
        for key in dictt:
            if isinstance(dictt[key], list):
                dictt[key] = dictt[key][0]
        # DataFrame.append and pd.io.json.json_normalize were removed or
        # deprecated in modern pandas; use pd.concat and pd.json_normalize.
        self.claimdf = pd.concat(
            [self.claimdf, pd.json_normalize(dictt)], ignore_index=True)
        self.claimdf.to_csv('metro.se.csv', encoding='utf-8')
def get_opengraph_description(url):
    description = None
    error_code = None
    ## TODO: Get detail on the errors
    try:
        r = requests.get(url, timeout=5)
    except requests.RequestException:
        error_code = '404'
        return description, error_code
    try:
        base_url = get_base_url(r.text, r.url)
    except Exception:
        error_code = '500'
        return description, error_code
    try:
        data = extruct.extract(r.text, base_url=base_url, syntaxes=['opengraph'])
    except Exception:
        error_code = '500'
        return description, error_code
    for og_item in data['opengraph']:
        if og_item.get('properties'):
            for prop, value in og_item['properties']:
                if prop == 'og:description':
                    description = value
    return description, '200'
def extraer_informacion_web():
    pp = pprint.PrettyPrinter(indent=2)
    r = requests.get(
        'https://www.casadellibro.com/libro-los-renglones-torcidos-de-hollywood/9788412094749/11187413'
    )
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)
    schema = data['json-ld']
    soup = BeautifulSoup(r.text, 'lxml')
    imagen = soup.find('img', {'class': 'product-image'})
    descripcion = soup.find('div', {'class': 'hidden-sm-and-down'})
    desc = descripcion.text
    desc = desc.replace("Ver más", '').strip()
    if "CRÍTICAS" in desc:
        desc = desc.split("CRÍTICAS")[0]
    if "Resumen" in desc:
        desc = desc.split("Resumen")[1]
    obj = {}
    obj['desc'] = desc
    obj['imagen'] = imagen['data-src']
    obj['schema'] = schema
    print(obj)
    print(type(obj))
def get_json_ld_headline(url):
    headline = None
    error_code = None
    ## TODO: Get detail on the errors
    try:
        r = requests.get(url, timeout=5)
    except requests.RequestException:
        error_code = '404'
        return headline, error_code
    try:
        base_url = get_base_url(r.text, r.url)
    except Exception:
        error_code = '500'
        return headline, error_code
    try:
        data = extruct.extract(r.text, base_url=base_url, syntaxes=['json-ld'])
    except Exception:
        error_code = '500'
        return headline, error_code
    for item in data['json-ld']:
        if item.get('headline'):
            headline = item['headline']
            # truncate long headlines to at most 200 characters
            headline = (headline[:197] + '...') if len(headline) > 197 else headline
    return headline, '200'
def load_jsonld():
    # read the file
    with open(source_file_path) as f:
        content = f.read()
    # extract the embedded metadata https://github.com/scrapinghub/extruct
    data = extruct.extract(content)
    claimReviews = data['json-ld']

    # some analysis of the labels to see how they are annotated
    labels = set(el['reviewRating']['alternateName'] for el in claimReviews)
    lambda_source = lambda el: el['author']['name']
    # group the labels by the author of the review, to see how each one
    # of them uses the alternateName
    labels_by_sources = {
        k: set(el['reviewRating']['alternateName'] for el in v)
        for k, v in itertools.groupby(
            sorted(claimReviews, key=lambda_source), key=lambda_source)
    }

    print('#claimReviews', len(claimReviews))
    print('#labels', len(labels))
    #print('labels', labels)
    print('#label for each source',
          {k: len(v) for k, v in labels_by_sources.items()})

    # save the original claimReviews
    utils.write_json_with_path(claimReviews, intermediate_path,
                               'datacommons_claimReviews.json')
    return claimReviews
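# The grouping idiom used above, shown in isolation: itertools.groupby only
# groups consecutive items, so the input must be sorted by the same key
# first. The reviews below are made-up data for illustration.
import itertools

reviews = [
    {'author': {'name': 'A'}, 'reviewRating': {'alternateName': 'False'}},
    {'author': {'name': 'B'}, 'reviewRating': {'alternateName': 'True'}},
    {'author': {'name': 'A'}, 'reviewRating': {'alternateName': 'Mostly false'}},
]
by_author = lambda el: el['author']['name']
grouped = {
    k: {el['reviewRating']['alternateName'] for el in v}
    for k, v in itertools.groupby(sorted(reviews, key=by_author), key=by_author)
}
print(grouped)  # {'A': {'False', 'Mostly false'}, 'B': {'True'}}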
def recursive_page_scrape(self, page):
    """
    Crawls across website and scrapes recipes as discovered.

    The navigation is recursive, scraping each page it comes upon and
    all the links on that page.
    """
    # Mark as being read
    print_message = f"Recipes Found: {len(self.scraped_recipes)} | Scraping {page}"
    print(print_message.ljust(200, " "), end="\r", flush=True)
    self.link_library[page] = 1
    self.sdb.update_one({"_id": self.source},
                        {"$set": {"link_library": self.link_library}})

    # Scrape it; if it errors out it won't be tried again
    r = requests.get(page, headers=self.headers)
    soup = BeautifulSoup(r.content, "html.parser")
    base_url = get_base_url(r.text, r.url)
    data = extruct.extract(r.text, base_url=base_url)

    # Recipe
    self.scrape_recipe(data, page)

    # Look for all links on page
    for link_string in soup.findAll('a', attrs={'href': re.compile("^https://")}):
        link = clean_url(link_string.get('href'), self.utm_pages)
        if check_link(link, self.domain):
            if link not in self.link_library.keys():
                wait()
                self.recursive_page_scrape(link)
def perform(kls, inputs):
    urls = inputs['target:url']
    # initialize before the loop so both names are bound even when every
    # URL fails (or the list is empty)
    data = None
    tree = None
    for url in urls:
        try:
            r = requests.get(url)
            base_url = get_base_url(r.text, r.url)
            data = extruct.extract(r.text, base_url=base_url,
                                   syntaxes=['json-ld'])['json-ld']
            tree = Tree(data)
            break
        except Exception:
            data = None
    return dict(
        **{
            'metric:30': {
                'answer': 1.0 if data else 0.0,
                'comment': ('jsonld was found and properly parsed'
                            if data else 'jsonld could not be parsed'),
            },
        },
        **{
            key: {
                'answer': 1.0 if attr else 0.0,
                'comment': attr if attr else 'json-ld %s not found' % (
                    ' '.join(to_schema[key])),
            } if key.startswith('metric:') else attr
            for key, attr in zip(
                to_schema.keys(),
                map(bind(get_json_ld_attr, tree), to_schema.values())
            )
        } if data else {key: {} for key in to_schema.keys()}
    )
def __init__(self, page_data, host):
    self.format = None
    self.host = host
    self.data = {}
    data = extruct.extract(
        page_data,
        syntaxes=SYNTAXES,
        uniform=True,
    )
    for syntax in SYNTAXES:
        for item in data.get(syntax, []):
            in_context = SCHEMA_ORG_HOST in item.get("@context", "")
            low_schema = [s.lower() for s in SCHEMA_NAMES]
            if in_context and item.get("@type", "").lower() in low_schema:
                self.format = syntax
                self.data = item
                if item.get("@type").lower() == 'webpage':
                    self.data = self.data.get('mainEntity')
                return
            elif in_context and "@graph" in item:
                for graph_item in item.get("@graph", ""):
                    in_graph = SCHEMA_ORG_HOST in graph_item.get("@context", "")
                    if in_graph and graph_item.get("@type", "").lower() in low_schema:
                        self.format = syntax
                        self.data = graph_item
                        if graph_item.get("@type").lower() == 'webpage':
                            self.data = self.data.get('mainEntity')
                        return
def basic_recipe_from_opengraph(html: str, url: str) -> dict:
    base_url = get_base_url(html, url)
    data = extruct.extract(html, base_url=base_url)
    try:
        properties = data["opengraph"][0]["properties"]
    except (KeyError, IndexError):
        return None
    return {
        "name": og_field(properties, "og:title"),
        "description": og_field(properties, "og:description"),
        "image": og_field(properties, "og:image"),
        "recipeYield": "",
        # FIXME: If recipeIngredient is an empty list, mongodb's data
        # verification fails.
        "recipeIngredient": ["Could not detect ingredients"],
        # FIXME: recipeInstructions is allowed to be empty but this message
        # is added for user sanity.
        "recipeInstructions": [{"text": "Could not detect instructions"}],
        "slug": slugify(og_field(properties, "og:title")),
        "orgURL": og_field(properties, "og:url"),
        "categories": [],
        "tags": og_fields(properties, "og:article:tag"),
        "dateAdded": None,
        "notes": [],
        "extras": [],
    }
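# A hedged sketch of driving basic_recipe_from_opengraph(); the URL is
# hypothetical, and og_field/og_fields/slugify are assumed to be defined
# in the same module as the function above.
import requests

response = requests.get("https://example.org/some-recipe")
recipe = basic_recipe_from_opengraph(response.text, response.url)
if recipe is None:
    print("no Open Graph metadata found")
else:
    print(recipe["name"], recipe["orgURL"])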
def __init__(self, page_data):
    self.format = None
    self.data = {}
    data = extruct.extract(page_data, syntaxes=SYNTAXES, uniform=True)
    low_schema = {s.lower() for s in SCHEMA_NAMES}
    for syntax in SYNTAXES:
        for item in data.get(syntax, []):
            in_context = SCHEMA_ORG_HOST in item.get("@context", "")
            if in_context and item.get("@type", "").lower() in low_schema:
                self.format = syntax
                self.data = item
                if item.get("@type").lower() == "webpage":
                    self.data = self.data.get("mainEntity")
                return
            elif in_context and "@graph" in item:
                for graph_item in item.get("@graph", ""):
                    graph_item_type = graph_item.get("@type", "")
                    if not isinstance(graph_item_type, str):
                        continue
                    if graph_item_type.lower() in low_schema:
                        in_graph = SCHEMA_ORG_HOST in graph_item.get("@context", "")
                        self.format = syntax
                        if graph_item_type.lower() == "webpage" and in_graph:
                            self.data = self.data.get("mainEntity")
                            return
                        elif graph_item_type.lower() == "recipe":
                            self.data = graph_item
                            return
def rdfFromHTML(html, base_url):
    data = extruct.extract(html,
                           syntaxes=['microdata', 'rdfa', 'json-ld'],
                           errors='ignore')
    print("URL: [" + base_url + "]")
    for type_key in data.keys():
        # list of namespaces used by this syntax
        ns_list = getRdfNameSpaces(data[type_key])
        if data[type_key] != []:
            print(type_key + " Found !")
            if base_url not in URL_WITH_RDF.keys():
                URL_WITH_RDF[base_url] = {}
            URL_WITH_RDF[base_url][type_key] = ns_list
    print("")
    return data
def main(url):
    with open('cytuj_strone.mustache') as template_file:
        template = template_file.read()
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    res = requests.get(url, headers=headers)
    data = extruct.extract(res.text, res.url)
    print(json.dumps(data))

    dd = {}
    if 'json-ld' in data and len(data.get('json-ld', [])) != 0:
        dd = parse_ld(data)
    elif 'microdata' in data:
        dd = parse_micro(data)

    dzisiaj = datetime.datetime.now().isoformat()[:10]  # today's date, YYYY-MM-DD
    dd['data_d'] = dzisiaj
    if not dd.get('url'):
        dd['url'] = url
    msg = "{{" + pystache.render(template, dd) + "}}"
    print(msg)
def extract_rdf_extruct(self, url) -> ConjunctiveGraph:
    while True:
        try:
            response = requests.get(url=url, timeout=10)
            break
        except SSLError:
            time.sleep(5)
        except requests.exceptions.Timeout:
            print("Timeout, retrying")
            time.sleep(5)
        except requests.exceptions.ConnectionError as e:
            print(e)
            print("ConnectionError, retrying...")
            time.sleep(10)

    self.status_code = response.status_code
    html_source = response.content
    data = extruct.extract(
        html_source, syntaxes=["microdata", "rdfa", "json-ld"], errors="ignore"
    )

    kg = ConjunctiveGraph()
    base_path = Path(__file__).parent.parent  # current directory
    static_file_path = str((base_path / "static/data/jsonldcontext.json").resolve())

    # The handling is identical for all three syntaxes: rewrite remote
    # schema.org contexts to the local copy, then parse as JSON-LD.
    for syntax in ("json-ld", "rdfa", "microdata"):
        for md in data.get(syntax, []):
            if "@context" in md.keys():
                if ("https://schema.org" in md["@context"]) or (
                    "http://schema.org" in md["@context"]
                ):
                    md["@context"] = static_file_path
            kg.parse(data=json.dumps(md, ensure_ascii=False), format="json-ld")

    logging.debug(kg.serialize(format="turtle"))
    kg.namespace_manager.bind("sc", URIRef("http://schema.org/"))
    kg.namespace_manager.bind("bsc", URIRef("https://bioschemas.org/"))
    kg.namespace_manager.bind("dct", URIRef("http://purl.org/dc/terms/"))
    return kg
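# A hedged usage sketch: assuming `checker` is an instance of the class that
# defines extract_rdf_extruct(), the returned rdflib ConjunctiveGraph can be
# queried with SPARQL for the schema.org types found on the page. The URL is
# illustrative only.
kg = checker.extract_rdf_extruct("https://example.org/dataset-landing-page")
for row in kg.query("SELECT DISTINCT ?type WHERE { ?s a ?type }"):
    print(row.type)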
def _get_extracted_data(self):
    extracted_data = extruct.extract(self.html,
                                     self.url,
                                     errors='ignore',
                                     syntaxes=['microdata'],
                                     uniform=True)
    return extracted_data['microdata']
def test_all(self):
    body = get_testdata('songkick', 'elysianfields.html')
    expected = json.loads(
        get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
    data = extruct.extract(
        body,
        base_url='http://www.songkick.com/artists/236156-elysian-fields')
    self.assertEqual(jsonize_dict(data), expected)
def _dochits_to_objset(self, docHits):
    '''Returns list of objects.'''
    objset = []
    for d in docHits:
        r = requests.get(d.text)
        # get JSON-LD from the page; get_base_url needs both the body and
        # the final URL (the original passed only the body)
        base_url = get_base_url(r.text, r.url)
        data = extruct.extract(r.text, base_url)
        jsld = data.get('json-ld')[0]
        obj = {}
        obj_mdata = defaultdict(list)
        for mdata in jsld:
            obj_mdata[mdata] = jsld[mdata]
        obj['metadata'] = dict(obj_mdata)
        objset.append(obj)
        self.docs_fetched += 1
    return objset