def execute_searches(self, config, entries):
    """Run every configured search plugin against the given entries.

    :param config: Discover plugin config
    :param entries: List of pseudo entries to search
    :return: List of entries found from search engines listed under `from` configuration
    """
    # Pick the title comparator based on the configured match type.
    search_type = config.get('type', 'normal')
    if search_type == 'normal':
        comparator = StringComparator(cutoff=0.7, cleaner=clean_title)
    elif search_type == 'exact':
        comparator = StringComparator(cutoff=0.9)
    elif search_type == 'any':
        comparator = AnyComparator()
    else:
        comparator = MovieComparator()
    result = []
    for item in config['from']:
        if isinstance(item, dict):
            # A `from` item may be a single-key {plugin_name: plugin_config} dict.
            # list() keeps this working on both Python 2 and 3.
            plugin_name, plugin_config = list(item.items())[0]
        else:
            plugin_name, plugin_config = item, None
        search = get_plugin_by_name(plugin_name).instance
        if not callable(getattr(search, 'search', None)):
            # Default of None makes a missing attribute log instead of raising
            # AttributeError; skip the plugin instead of calling the missing
            # method below (the original fell through and crashed anyway).
            log.critical('Search plugin %s does not implement search method' % plugin_name)
            continue
        for entry in entries:
            try:
                search_results = search.search(entry['title'], comparator, plugin_config)
                log.debug('Discovered %s entries from %s' % (len(search_results), plugin_name))
                # `limit` may be absent; slicing with None keeps all results.
                result.extend(search_results[:config.get('limit')])
            except (PluginError, PluginWarning):
                log.debug('No results from %s' % plugin_name)
    return sorted(result, reverse=True, key=lambda x: x.get('search_sort'))
def search(self, query, comparator=None, config=None):
    """Search the torrentz.eu RSS feed for `query`.

    :param query: Title to search for
    :param comparator: Title comparator; a fresh StringComparator is created
        when None (a default-argument instance would be shared and mutated
        via set_seq1/set_seq2 across calls)
    :param config: Reputation feed name (a REPUTATIONS key), defaults to 'good'
    :return: List of matching Entry objects, best availability first
    :raises PluginWarning: on non-200 response, bad feed, or no close matches
    """
    if comparator is None:
        comparator = StringComparator()
    if config:
        feed = REPUTATIONS[config]
    else:
        feed = REPUTATIONS['good']
    comparator.set_seq1(query)
    query = comparator.search_string()
    # urllib.quote will crash if the unicode string has non ascii characters, so encode in utf-8 beforehand
    url = 'http://torrentz.eu/%s?q=%s' % (feed, urllib.quote(query.encode('utf-8')))
    log.debug('requesting: %s' % url)
    rss = feedparser.parse(url)
    status = rss.get('status', False)
    if status != 200:
        raise PluginWarning('Search result not 200 (OK), received %s' % status)
    if rss.get('bozo_exception', False):
        raise PluginWarning('Got bozo_exception (bad feed)')
    entries = []
    for item in rss.entries:
        # assign confidence score of how close this link is to the name you're looking for. .6 and above is "close"
        comparator.set_seq2(item.title)
        log.debug('name: %s' % comparator.a)
        log.debug('found name: %s' % comparator.b)
        log.debug('confidence: %s' % comparator.ratio())
        if not comparator.matches():
            continue
        m = re.search(r'Size: ([\d]+) Mb Seeds: ([,\d]+) Peers: ([,\d]+) Hash: ([a-f0-9]+)',
                      item.description, re.IGNORECASE)
        if not m:
            log.debug('regexp did not find seeds / peer data')
            continue
        entry = Entry()
        entry['title'] = item.title
        entry['url'] = item.link
        entry['content_size'] = int(m.group(1))
        # Seed/peer counts may contain thousands separators.
        entry['torrent_seeds'] = int(m.group(2).replace(',', ''))
        entry['torrent_leeches'] = int(m.group(3).replace(',', ''))
        entry['torrent_info_hash'] = m.group(4).upper()
        entry['search_ratio'] = comparator.ratio()
        entry['search_sort'] = torrent_availability(entry['torrent_seeds'], entry['torrent_leeches'])
        entries.append(entry)
    # choose torrent
    if not entries:
        raise PluginWarning('No close matches for %s' % query, log, log_once=True)
    entries.sort(reverse=True, key=lambda x: x.get('search_sort'))
    log.debug('Search got %d results' % len(entries))
    return entries
def on_task_urlrewrite(self, task, config):
    """Resolve download URLs for accepted entries via configured search plugins.

    Tries each configured search in order and stops at the first that yields
    a result; entries with no result are rejected and made mortal.
    """
    # no searches in unit test mode
    if task.manager.unit_test:
        return
    plugins = {}
    for plugin in get_plugins_by_group('search'):
        plugins[plugin.name] = plugin.instance
    # search accepted
    for entry in task.accepted:
        found = False
        # loop through configured searches
        for name in config:
            search_config = None
            if isinstance(name, dict):
                # assume the name is the first/only key in the dict.
                # list() keeps this working on both Python 2 and 3.
                name, search_config = list(name.items())[0]
            log.verbose('Searching `%s` from %s' % (entry['title'], name))
            try:
                results = plugins[name].search(entry['title'], StringComparator(cutoff=0.9), search_config)
                if results:
                    url = results[0]['url']
                    log.debug('Found url: %s' % url)
                    entry['url'] = url
                    found = True
                    break
            # `except X as e` works on Python 2.6+ and 3.x, unlike the comma form.
            except (PluginError, PluginWarning) as pw:
                log.verbose('Failed: %s' % pw.value)
                continue
        # Search failed
        if not found:
            # If I don't have a URL, doesn't matter if I'm immortal...
            entry['immortal'] = False
            task.reject(entry, 'search failed')
def search(self, query, comparator=None, config=None):
    """
    Search for name from piratebay.

    :param query: Title to search for
    :param comparator: Title comparator; a fresh StringComparator is created
        when None (a default-argument instance would be shared and mutated
        via set_seq1/set_seq2 across calls)
    :param config: Optional dict with 'sort_by', 'sort_reverse' and 'category'
    :return: List of matching Entry objects, best availability first
    :raises PluginWarning: when no close matches are found
    """
    if comparator is None:
        comparator = StringComparator()
    if not isinstance(config, dict):
        config = {}
    sort = SORT.get(config.get('sort_by', 'seeds'))
    if config.get('sort_reverse'):
        # Reverse ordering is encoded as sort id + 1 in the site URL scheme.
        sort += 1
    if isinstance(config.get('category'), int):
        category = config['category']
    else:
        category = CATEGORIES.get(config.get('category', 'all'))
    filter_url = '/0/%d/%d' % (sort, category)
    comparator.set_seq1(query)
    query = comparator.search_string()
    # urllib.quote will crash if the unicode string has non ascii characters, so encode in utf-8 beforehand
    url = 'http://thepiratebay.se/search/' + urllib.quote(query.encode('utf-8')) + filter_url
    log.debug('Using %s as piratebay search url' % url)
    page = requests.get(url).content
    soup = get_soup(page)
    entries = []
    for link in soup.find_all('a', attrs={'class': 'detLink'}):
        comparator.set_seq2(link.contents[0])
        log.debug('name: %s' % comparator.a)
        log.debug('found name: %s' % comparator.b)
        log.debug('confidence: %s' % comparator.ratio())
        if not comparator.matches():
            continue
        entry = Entry()
        entry['title'] = link.contents[0]
        entry['url'] = 'http://thepiratebay.se' + link.get('href')
        tds = link.parent.parent.parent.find_all('td')
        entry['torrent_seeds'] = int(tds[-2].contents[0])
        entry['torrent_leeches'] = int(tds[-1].contents[0])
        entry['search_ratio'] = comparator.ratio()
        entry['search_sort'] = torrent_availability(entry['torrent_seeds'], entry['torrent_leeches'])
        # Parse content_size; \xa0 is the non-breaking space used in the page
        # markup (deliberately NOT a raw string so \xa0 stays a real character).
        size = link.find_next(attrs={'class': 'detDesc'}).contents[0]
        size = re.search('Size ([\.\d]+)\xa0([GMK])iB', size)
        if size:
            # Site reports decimal units; convert to MiB.
            if size.group(2) == 'G':
                entry['content_size'] = int(float(size.group(1)) * 1000 ** 3 / 1024 ** 2)
            elif size.group(2) == 'M':
                entry['content_size'] = int(float(size.group(1)) * 1000 ** 2 / 1024 ** 2)
            else:
                entry['content_size'] = int(float(size.group(1)) * 1000 / 1024 ** 2)
        entries.append(entry)
    if not entries:
        dashindex = query.rfind('-')
        if dashindex != -1:
            # Retry with the trailing '-suffix' stripped; pass config through
            # so the retry keeps the same sort/category (the original dropped it).
            return self.search(query[:dashindex], comparator=comparator, config=config)
        else:
            raise PluginWarning('No close matches for %s' % query, log, log_once=True)
    entries.sort(reverse=True, key=lambda x: x.get('search_sort'))
    return entries
def entries_from_search(self, name, url=None, comparator=None):
    """Parses torrent download url from newtorrents.info search results.

    :param name: Name to search for
    :param url: Optional explicit search url; built from `name` when None
    :param comparator: Title comparator; a fresh StringComparator(cutoff=0.9)
        is created when None (a default-argument instance would be shared
        and mutated via set_seq1 across calls)
    :return: List of matching Entry objects, most seeders first
    :raises PluginWarning: when nothing matches
    """
    if comparator is None:
        comparator = StringComparator(cutoff=0.9)
    comparator.set_seq1(name)
    name = comparator.search_string()
    if not url:
        url = 'http://www.newtorrents.info/search/%s' % urllib.quote(
            name.encode('utf-8'), safe=':/~?=&%')
    log.debug('search url: %s' % url)
    html = urlopener(url, log).read()
    # fix </SCR'+'IPT> so that BS does not crash
    # TODO: should use beautifulsoup massage
    html = re.sub(r'(</SCR.*?)...(.*?IPT>)', r'\1\2', html)
    soup = get_soup(html)
    # saving torrents in dict
    torrents = []
    for link in soup.find_all('a', attrs={'href': re.compile('down.php')}):
        torrent_url = 'http://www.newtorrents.info%s' % link.get('href')
        release_name = link.parent.next.get('title')
        # quick dirty hack
        seed = link.find_next('td', attrs={'class': re.compile('s')}).renderContents()
        if seed == 'n/a':
            seed = 0
        else:
            try:
                seed = int(seed)
            except ValueError:
                log.warning('Error converting seed value (%s) from newtorrents to integer.' % seed)
                seed = 0
        #TODO: also parse content_size and peers from results
        if comparator.matches(release_name):
            torrents.append(Entry(title=release_name,
                                  url=torrent_url,
                                  torrent_seeds=seed,
                                  search_ratio=comparator.ratio(),
                                  search_sort=torrent_availability(seed, 0)))
        else:
            log.debug('rejecting search result: %s !~ %s' % (release_name, name))
    # sort with seed number Reverse order
    torrents.sort(reverse=True, key=lambda x: x.get('search_sort', 0))
    # choose the torrent
    if not torrents:
        dashindex = name.rfind('-')
        if dashindex != -1:
            # Retry with the trailing '-suffix' stripped.
            return self.entries_from_search(name[:dashindex], comparator=comparator)
        else:
            raise PluginWarning('No matches for %s' % name, log, log_once=True)
    else:
        if len(torrents) == 1:
            log.debug('found only one matching search result.')
        else:
            log.debug('search result contains multiple matches, sorted %s by most seeders' % torrents)
        return torrents
def search(self, query, comparator=None, config=None):
    """
    Search for name from torrentleech.

    :param query: Title to search for
    :param comparator: Title comparator; a fresh StringComparator is created
        when None (a default-argument instance would be shared and mutated
        via set_seq1/set_seq2 across calls)
    :param config: Dict with required 'rss_key', 'username' and 'password',
        plus optional 'category' (CATEGORIES name or raw int id)
    :return: List of matching Entry objects, best availability first
    :raises PluginWarning: when no close matches are found
    """
    if comparator is None:
        comparator = StringComparator()
    # Normalize config before reading any keys; the original performed this
    # check only after already subscripting config, making it dead code.
    if not isinstance(config, dict):
        config = {}
    rss_key = config['rss_key']
    # build the form request:
    data = {'username': config['username'],
            'password': config['password'],
            'remember_me': 'on',
            'submit': 'submit'}
    # POST the login form to obtain session cookies:
    login = requests.post('http://torrentleech.org/', data=data)
    if isinstance(config.get('category'), int):
        category = config['category']
    else:
        category = CATEGORIES.get(config.get('category', 'all'))
    filter_url = '/categories/%d' % category
    comparator.set_seq1(query)
    query = comparator.search_string()
    # urllib.quote will crash if the unicode string has non ascii characters, so encode in utf-8 beforehand
    url = ('http://torrentleech.org/torrents/browse/index/query/' +
           urllib.quote(query.encode('utf-8')) + filter_url)
    log.debug('Using %s as torrentleech search url' % url)
    page = requests.get(url, cookies=login.cookies).content
    soup = get_soup(page)
    entries = []
    for tr in soup.find_all("tr", ["even", "odd"]):
        # within each even or odd row, find the torrent names
        link = tr.find("a", attrs={'href': re.compile(r'/torrent/\d+')})
        log.debug('link phase: %s' % link.contents[0])
        # extracts the contents of the <a>titlename/<a> tag
        comparator.set_seq2(link.contents[0])
        log.debug('name: %s' % comparator.a)
        log.debug('found name: %s' % comparator.b)
        log.debug('confidence: %s' % comparator.ratio())
        if not comparator.matches():
            continue
        entry = Entry()
        entry['title'] = link.contents[0]
        # find download link
        torrent_url = tr.find("a", attrs={'href': re.compile(r'/download/\d+/.*')}).get('href')
        # parse link and split along /download/12345 and /name.torrent
        download_url = re.search(r'(/download/\d+)/(.+\.torrent)', torrent_url)
        if not download_url:
            # Unexpected link format; skip the row instead of crashing on .group().
            log.debug('could not parse download url: %s' % torrent_url)
            continue
        # change link to rss and splice in rss_key
        torrent_url = ('http://torrentleech.org/rss' + download_url.group(1) +
                       '/' + rss_key + '/' + download_url.group(2))
        log.debug('RSS-ified download link: %s' % torrent_url)
        entry['url'] = torrent_url
        # us tr object for seeders/leechers
        seeders, leechers = tr.find_all('td', ["seeders", "leechers"])
        entry['torrent_seeds'] = int(seeders.contents[0])
        entry['torrent_leeches'] = int(leechers.contents[0])
        entry['search_ratio'] = comparator.ratio()
        entry['search_sort'] = torrent_availability(entry['torrent_seeds'], entry['torrent_leeches'])
        # use tr object for size
        size = tr.find("td", text=re.compile(r'([\.\d]+) ([GMK])B')).contents[0]
        size = re.search(r'([\.\d]+) ([GMK])B', size)
        if size:
            # Site reports decimal units; convert to MiB.
            if size.group(2) == 'G':
                entry['content_size'] = int(float(size.group(1)) * 1000 ** 3 / 1024 ** 2)
            elif size.group(2) == 'M':
                entry['content_size'] = int(float(size.group(1)) * 1000 ** 2 / 1024 ** 2)
            else:
                entry['content_size'] = int(float(size.group(1)) * 1000 / 1024 ** 2)
        entries.append(entry)
    if not entries:
        dashindex = query.rfind('-')
        if dashindex != -1:
            # Retry with the trailing '-suffix' stripped; pass config through —
            # the original dropped it, so the retry crashed on config['rss_key'].
            return self.search(query[:dashindex], comparator=comparator, config=config)
        else:
            raise PluginWarning('No close matches for %s' % query, log, log_once=True)
    entries.sort(reverse=True, key=lambda x: x.get('search_sort'))
    return entries