def _get_urls_for_entity_id(dump_path: str, l_path: str, resolve: bool) -> dict:
    """Given a l_{something}_url relationship file, return a dict of
    ``{entity_id: [url, ...]}``.

    :param dump_path: path of the MusicBrainz dump root directory
    :param l_path: path of the ``l_{entity}_url`` relationship TSV file
    :param resolve: when True, also drop URLs that do not resolve
    :return: dict mapping each entity ID to its list of clean, valid URLs
    """
    # NOTE: lazy %-style args, no f-string prefix (the original mixed both)
    LOGGER.info('Loading %s relationships', l_path)

    # First pass: map URL ID (column 3) -> entity ID (column 2)
    urlid_entityid_relationship = {}
    with open(l_path, "r") as tsvfile:
        url_relationships = DictReader(
            tsvfile, delimiter='\t', fieldnames=list(range(6)))
        for relationship in tqdm(
                url_relationships, total=count_num_lines_in_file(tsvfile)):
            # url id matched with its user id
            if relationship[3] in urlid_entityid_relationship:
                LOGGER.warning(
                    'Url with ID %s has multiple entities, only one will '
                    'be stored',
                    relationship[3],
                )
            else:
                urlid_entityid_relationship[relationship[3]] = relationship[2]

    url_path = os.path.join(dump_path, 'mbdump', 'url')
    url_entityid = {}
    LOGGER.info('Checking URLs related to entity')
    # Second pass: translate URL IDs to the actual URL strings, keeping
    # only clean, valid (and optionally resolvable) candidates
    with open(url_path, "r") as tsvfile:
        urls = DictReader(tsvfile, delimiter='\t', fieldnames=list(range(5)))
        for url_record in tqdm(urls, total=count_num_lines_in_file(tsvfile)):
            urlid = url_record[0]
            if urlid in urlid_entityid_relationship:
                for candidate_url in url_utils.clean(url_record[2]):
                    if not url_utils.validate(candidate_url):
                        continue
                    if resolve and not url_utils.resolve(candidate_url):
                        continue
                    url_entityid[candidate_url] = \
                        urlid_entityid_relationship[urlid]
                # Each URL ID is consumed exactly once
                del urlid_entityid_relationship[urlid]

    # Inverts dictionary: {url: entity_id} -> {entity_id: [urls]}
    entityid_url = defaultdict(list)
    for url, entityid in url_entityid.items():
        entityid_url[entityid].append(url)
    return entityid_url
def _isni_link_generator(self, dump_path):
    """Yield person and/or band link entities for artists with an ISNI.

    Reads the ``artist_isni`` dump file, builds an ISNI-based URL per
    artist via the Wikidata P213 URL formatter, then walks the ``artist``
    dump and yields a filled link entity for each matching artist.

    :param dump_path: path of the MusicBrainz dump root directory
    """
    isni_file_path = os.path.join(dump_path, 'mbdump', 'artist_isni')
    # artist id -> ISNI-derived URL (last valid candidate wins)
    artist_link = {}
    done = False
    for result in external_id_pids_and_urls_query():
        if done:
            # P213 already processed: stop consuming query results
            break
        for pid, formatter in result.items():
            if pid == 'P213':  # P213 is the Wikidata ISNI property
                for url_formatter, regex in formatter.items():
                    # NOTE(review): compiled regex is never used below, so
                    # the ISNI value is not actually validated against it
                    # -- confirm whether a match check was intended
                    r = re.compile(regex)
                    with open(isni_file_path, 'r') as artistfile:
                        for artistid_isni in DictReader(
                                artistfile,
                                delimiter='\t',
                                fieldnames=['id', 'isni']):
                            # If ISNI is valid, generates an url for the artist
                            artistid = artistid_isni['id']
                            isni = artistid_isni['isni']
                            link = url_formatter.replace('$1', isni)
                            for candidate_url in url_utils.clean(link):
                                if not url_utils.validate(candidate_url):
                                    continue
                                if not url_utils.resolve(candidate_url):
                                    continue
                                artist_link[artistid] = candidate_url
                    done = True
    artist_path = os.path.join(dump_path, 'mbdump', 'artist')
    with open(artist_path, 'r') as artistfile:
        for artist in DictReader(
                artistfile,
                delimiter='\t',
                fieldnames=['id', 'gid', 'label', 'sort_label',
                            'b_year', 'b_month', 'b_day',
                            'd_year', 'd_month', 'd_day',
                            'type_id']):
            try:
                # Checks if artist has isni; KeyError means it has none
                link = artist_link[artist['id']]
                if self._check_person(artist['type_id']):
                    current_entity = MusicbrainzArtistLinkEntity()
                    self._fill_link_entity(
                        current_entity, artist['gid'], link)
                    yield current_entity
                if self._check_band(artist['type_id']):
                    current_entity = MusicbrainzBandLinkEntity()
                    self._fill_link_entity(
                        current_entity, artist['gid'], link)
                    yield current_entity
            except KeyError:
                continue
def _check_link(self, link, resolve: bool):
    """Clean *link* and yield each surviving URL.

    Parts that fail validation bump ``self.dead_links``. When *resolve*
    is False, every valid URL is yielded as-is; when True, URLs that do
    not resolve also count as dead, and only living URLs are yielded
    (each one incrementing ``self.valid_links``).

    :param link: raw link string to clean and check
    :param resolve: whether to additionally require the URL to resolve
    """
    LOGGER.debug('Processing link <%s>', link)
    candidates = url_utils.clean(link)
    LOGGER.debug('Clean link: %s', candidates)
    for candidate in candidates:
        checked = url_utils.validate(candidate)
        if checked:
            LOGGER.debug('Valid URL: <%s>', checked)
            if resolve:
                living = url_utils.resolve(checked)
                if living:
                    LOGGER.debug('Living URL: <%s>', living)
                    self.valid_links += 1
                    yield living
                else:
                    self.dead_links += 1
            else:
                yield checked
        else:
            self.dead_links += 1
def _link_generator(self, dump_path):
    """Yield person and/or band link entities for every artist URL.

    Joins ``l_artist_url`` (URL ID -> artist ID), ``url`` (URL ID -> URL)
    and ``artist``, yielding one filled link entity per clean, valid,
    resolvable URL of each artist.

    :param dump_path: path of the MusicBrainz dump root directory
    """
    l_artist_url_path = os.path.join(dump_path, 'mbdump', 'l_artist_url')

    # Loads all the relationships between URL ID and ARTIST ID
    urlid_artistid_relationship = {}
    with open(l_artist_url_path, "r") as tsvfile:
        url_relationships = DictReader(
            tsvfile, delimiter='\t', fieldnames=list(range(6)))
        for relationship in url_relationships:
            # url id matched with its user id
            if relationship[3] in urlid_artistid_relationship:
                # Lazy %-args: the message is only formatted if emitted
                LOGGER.warning(
                    'Url with ID %s has multiple artists, only one will be stored',
                    relationship[3])
            else:
                urlid_artistid_relationship[relationship[3]] = relationship[2]

    url_artistid = {}
    url_path = os.path.join(dump_path, 'mbdump', 'url')
    # Translates URL IDs to the relative URL
    with open(url_path, "r") as tsvfile:
        urls = DictReader(tsvfile, delimiter='\t', fieldnames=list(range(5)))
        for url_record in urls:
            urlid = url_record[0]
            if urlid in urlid_artistid_relationship:
                for candidate_url in url_utils.clean(url_record[2]):
                    if not url_utils.validate(candidate_url):
                        continue
                    if not url_utils.resolve(candidate_url):
                        continue
                    url_artistid[candidate_url] = \
                        urlid_artistid_relationship[urlid]
                # Each URL ID is consumed exactly once
                del urlid_artistid_relationship[urlid]
    # Release the no-longer-needed mapping before the next big build
    urlid_artistid_relationship = None

    artistid_url = defaultdict(list)
    # Inverts dictionary: {url: artist_id} -> {artist_id: [urls]}
    for url, artistid in url_artistid.items():
        artistid_url[artistid].append(url)
    url_artistid = None

    # Translates ARTIST ID to the relative ARTIST
    artist_path = os.path.join(dump_path, 'mbdump', 'artist')
    with open(artist_path, 'r') as artistfile:
        for artist in DictReader(
                artistfile,
                delimiter='\t',
                fieldnames=['id', 'gid', 'label', 'sort_label',
                            'b_year', 'b_month', 'b_day',
                            'd_year', 'd_month', 'd_day',
                            'type_id']):
            if artist['id'] in artistid_url:
                for link in artistid_url[artist['id']]:
                    if self._check_person(artist['type_id']):
                        current_entity = MusicbrainzArtistLinkEntity()
                        self._fill_link_entity(
                            current_entity, artist['gid'], link)
                        yield current_entity
                    if self._check_band(artist['type_id']):
                        current_entity = MusicbrainzBandLinkEntity()
                        self._fill_link_entity(
                            current_entity, artist['gid'], link)
                        yield current_entity