def test_progressbarize():
    """Test the progressbarize function."""
    x = range(10)
    y = progressbarize(x)
    # Without progress requested, the iterable is returned untouched.
    assert x == y

    z = progressbarize(x, progress=True)
    # With progress requested, a (truthy) progress-bar wrapper is returned.
    assert z
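The progressbarize helper itself is not shown on this page. Here is a minimal sketch consistent with the test above and the calls below, assuming the progressbar2 package is available (the real implementation may differ):

import progressbar

def progressbarize(iterable, progress=False):
    """Wrap an iterable in a progress bar if requested; return it unchanged otherwise."""
    if progress:
        # Assumption: progressbar2's convenience wrapper; any reporting
        # wrapper that yields the original items would satisfy the test above.
        return progressbar.progressbar(iterable)
    return iterable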
Example #2
def aggregate(input_keywords_file, no_synonyms=None, use_progressbar=False):  # pylint: disable=too-many-branches
    """Aggregate available topics.

    :param input_keywords_file: a list/tuple of input keywords files to process
    :param no_synonyms: do not compute synonyms for keywords
    :param use_progressbar: use progressbar to report progress
    :return:
    """
    if not input_keywords_file:
        raise ValueError('No input keywords files provided')

    all_keywords = {}
    for input_file in progressbarize(input_keywords_file, use_progressbar):
        input_content = anymarkup.parse_file(input_file)
        for keyword, value in input_content.items():
            keyword = str(keyword)

            if not KeywordsChief.matches_keyword_pattern(keyword):
                _logger.debug(
                    "Dropping keyword '%s' as it does not match keyword pattern.",
                    keyword)
                continue

            if keyword in all_keywords and value is not None \
                    and all_keywords[keyword] is not None:
                # Merge with the already stored entry: sum occurrence counts
                # and union the remaining per-key item lists.
                all_keywords[keyword]['occurrence_count'] = \
                    value.pop('occurrence_count', 0) + \
                    all_keywords[keyword].get('occurrence_count', 0)
                for conf, items in value.items():
                    # Use .get() here - the stored entry may lack this key.
                    all_keywords[keyword][str(conf)] = list(
                        set(items or [])
                        | set(all_keywords[keyword].get(str(conf)) or []))
            else:
                all_keywords[keyword] = value if value is not None else {}

            if not no_synonyms:
                # all_keywords[keyword] is guaranteed to be a dict by now.
                synonyms = list(
                    set(all_keywords[keyword].get('synonyms') or [])
                    | set(KeywordsChief.compute_synonyms(keyword)))

                if synonyms:
                    all_keywords[keyword]['synonyms'] = synonyms

    # Filter out keywords with a low occurrence count.
    if defaults.OCCURRENCE_COUNT_FILTER > 1:
        return {keyword: value
                for keyword, value in all_keywords.items()
                if value.get('occurrence_count', 1) > defaults.OCCURRENCE_COUNT_FILTER}

    return all_keywords
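A hypothetical invocation of aggregate; the file names are illustrative only:

all_topics = aggregate(('keywords-pypi.yaml', 'keywords-maven.yaml'),
                       no_synonyms=False, use_progressbar=True)
for keyword, entry in all_topics.items():
    print(keyword, entry.get('occurrence_count', 1))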
Example #3
def lookup_file(path,
                keywords_file=None,
                stopwords_file=None,
                ignore_errors=False,
                ngram_size=None,
                use_progressbar=False,
                lemmatize=False,
                stemmer=None,
                scorer=None):
    # pylint: disable=too-many-arguments,too-many-locals
    """Perform keywords lookup on a file or directory tree of files.

    :param path: path of directory tree or file on which the lookup should be done
    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ignore_errors: True, if errors should be reported but computation shouldn't be stopped
    :param ngram_size: size of ngrams, if None, ngram size is computed
    :param use_progressbar: True if progressbar should be shown
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    :param scorer: scorer to be used
    :type scorer: f8a_tagger.scoring.Scoring
    :return: found keywords, reported per file
    """
    ret = {}
    ngram_size, tokenizer, chief, core_parser = _prepare_lookup(
        keywords_file, stopwords_file, ngram_size, lemmatize, stemmer)
    for project, file in progressbarize(iter_files(path, ignore_errors),
                                        progress=use_progressbar):
        file_name = file
        if not isinstance(file, str):
            file_name = file.name
        _logger.info("Processing file '%s' for project '%s'", file_name,
                     project)
        try:
            content = core_parser.parse_file(file_name)
            keywords = _perform_lookup(content, tokenizer, chief, scorer)
        except Exception as exc:  # pylint: disable=broad-except
            if not ignore_errors:
                raise
            _logger.exception("Failed to parse content in file '%s': %s",
                              file_name, str(exc))
            continue
        finally:
            # Remove the temporary file here so the progressbar can be used safely.
            if not isinstance(file, str):
                _logger.debug("Removing temporary file '%s' for project '%s'",
                              file_name, project)
                os.remove(file_name)

        ret[project] = keywords

    return ret
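A hypothetical call to lookup_file; the path and keywords file are illustrative:

results = lookup_file('package-sources/',
                      keywords_file='keywords.yaml',
                      ignore_errors=True,
                      use_progressbar=True)
for project, keywords in results.items():
    print(project, keywords)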
Example #4
    def execute(self, ignore_errors=True, use_progressbar=False):
        """Collect Maven keywords."""
        keywords_set = KeywordsSet()

        _logger.debug("Fetching Maven and executing Maven index checker")
        maven_index_checker_dir = get_files_dir()
        maven_index_checker_jar = path.join(maven_index_checker_dir,
                                            "maven-index-checker.jar")

        if not path.isfile(maven_index_checker_jar):
            raise InstallPrepareError(
                "Maven index checker was not found in '%s', did you forget "
                "to run prepare()?" % maven_index_checker_jar)

        with cwd(maven_index_checker_dir):
            # This requires at least 4 GB of free space on the /tmp partition.
            packages = loads(
                check_output(['java', '-jar', maven_index_checker_jar, '-it']))

        # Versions are irrelevant for keyword collection; drop them and
        # deduplicate the remaining groupId/artifactId records.
        for package in packages:
            del package['version']
        packages = [
            dict(s) for s in set(frozenset(d.items()) for d in packages)
        ]

        _logger.debug("started fetching data from mvnrepository.com")
        try:
            for package in progressbarize(packages, use_progressbar):
                package_name = package['groupId'] + '/' + package['artifactId']
                response = get(self._MVNREPOSITORY_URL + package_name)
                if response.ok is not True:
                    error_msg = "Failed to retrieve package information for '{}', " \
                                "response status code: {}". \
                        format(package_name, response.status_code)
                    if ignore_errors:
                        _logger.error(error_msg)
                        continue
                    raise RuntimeError(error_msg)

                soup = BeautifulSoup(response.text, 'lxml')
                for i in soup.find_all(class_="b tag"):
                    keywords_set.add(i.text)

                # mvnrepository.com seems to limit clients to 2000 requests
                # per hour, so sleeping for 2 seconds between requests should
                # do the trick.
                sleep(2)
        finally:
            # Clean unpacked maven index after executing
            _logger.debug("Cleaning unpacked maven index")
            rmtree(path.join(maven_index_checker_dir, "target"))

        return keywords_set
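The deduplication idiom above deserves a standalone illustration: plain dicts are unhashable and cannot go into a set directly, but frozensets of their items can (the data below is made up):

packages = [
    {'groupId': 'org.example', 'artifactId': 'core'},
    {'groupId': 'org.example', 'artifactId': 'core'},   # duplicate once 'version' is gone
    {'groupId': 'org.example', 'artifactId': 'utils'},
]
unique = [dict(s) for s in set(frozenset(d.items()) for d in packages)]
assert len(unique) == 2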
Example #5
    def execute(self, ignore_errors=True, use_progressbar=False):
        """Collect PyPI keywords."""
        keywords_set = KeywordsSet()

        _logger.debug("Fetching PyPI")
        response = requests.get(self._PYPI_SIMPLE_URL)
        if response.status_code != 200:
            raise RuntimeError(
                "Failed to fetch '%s', request ended with status code %s" %
                (self._PYPI_SIMPLE_URL, response.status_code))

        soup = BeautifulSoup(response.text, 'lxml')
        for link in progressbarize(soup.find_all('a'), use_progressbar):
            package_name = link.text
            url = urljoin(self._PACKAGE_BASE_URL, package_name)
            response = requests.get(url)
            if response.status_code != 200:
                error_msg = "Failed to retrieve package information for '{}', " \
                            "response status code: {}".format(
                                package_name, response.status_code)
                if ignore_errors:
                    _logger.error(error_msg)
                    continue
                raise RuntimeError(error_msg)

            package_soup = BeautifulSoup(response.text, 'lxml')
            # meta_keywords = package_soup.find_all('meta', attrs={'name': 'keywords'})
            meta_keywords = package_soup.find_all('p', attrs={'class': 'tags'})
            if not meta_keywords:
                warn_msg = "Failed to parse and find keywords for '%s'" % package_name
                _logger.warning(warn_msg)
                continue

            # Some packages have commas hardcoded in the keyword list, so split on commas as well.
            found_keywords = []
            keywords_spans = meta_keywords[0].find_all(
                'span', attrs={'class': 'package-keyword'})
            for span in keywords_spans:
                for word in span.contents:
                    found_keywords += [
                        k.strip().lower() for k in word.split(',')
                        if k.strip() != ""
                    ]

            _logger.debug("Found keywords %s in '%s'", found_keywords,
                          package_name)

            for keyword in set(found_keywords):
                keywords_set.add(keyword)

        return keywords_set
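A standalone illustration of the comma-splitting normalization above (the input strings are made up):

span_contents = ['machine-learning, AI', ' data ']
found_keywords = []
for word in span_contents:
    found_keywords += [k.strip().lower() for k in word.split(',') if k.strip() != ""]
assert found_keywords == ['machine-learning', 'ai', 'data']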
Example #6
def lookup(path,
           keywords_file=None,
           stopwords_file=None,
           ignore_errors=False,
           ngram_size=None,
           use_progressbar=False,
           lemmatize=False,
           stemmer=None):
    # pylint: disable=too-many-arguments,too-many-locals
    """Perform keywords lookup.

    :param path: path of directory tree or file on which the lookup should be done
    :param keywords_file: keywords file to be used
    :param stopwords_file: stopwords file to be used
    :param ignore_errors: True, if errors should be reported but computation shouldn't be stopped
    :param ngram_size: size of ngrams, if None, ngram size is computed
    :param use_progressbar: True if progressbar should be shown
    :param lemmatize: use lemmatizer
    :type lemmatize: bool
    :param stemmer: stemmer to be used
    :type stemmer: str
    :return: found keywords, reported per file
    """
    ret = {}

    stemmer_instance = Stemmer.get_stemmer(
        stemmer) if stemmer is not None else None
    lemmatizer_instance = Lemmatizer.get_lemmatizer() if lemmatize else None

    chief = KeywordsChief(keywords_file,
                          lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)
    computed_ngram_size = chief.compute_ngram_size()
    if ngram_size is not None and computed_ngram_size > ngram_size:
        _logger.warning(
            "Computed ngram size (%d) exceeds the supplied ngram size (%d), "
            "some synonyms will be omitted", computed_ngram_size,
            ngram_size)
    elif ngram_size is None:
        ngram_size = computed_ngram_size

    tokenizer = Tokenizer(stopwords_file,
                          ngram_size,
                          lemmatizer=lemmatizer_instance,
                          stemmer=stemmer_instance)

    for file in progressbarize(iter_files(path, ignore_errors),
                               progress=use_progressbar):
        _logger.info("Processing file '%s'", file)
        try:
            content = CoreParser().parse_file(file)
            tokens = tokenizer.tokenize(content)
            # We do not analyze sentence structure for now, so flatten the
            # sentences of tokens into a single token stream.
            tokens = chain(*tokens)
            keywords = chief.extract_keywords(tokens)
        except Exception as exc:  # pylint: disable=broad-except
            if not ignore_errors:
                raise
            _logger.exception("Failed to parse content in file '%s': %s", file,
                              str(exc))
            continue

        ret[file] = keywords

    return ret
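A hypothetical call to lookup; the path and keywords file are illustrative. The result is keyed per file, as the docstring states:

keywords_per_file = lookup('project-sources/',
                           keywords_file='keywords.yaml',
                           ignore_errors=True,
                           lemmatize=True)
for file_name, keywords in keywords_per_file.items():
    print(file_name, keywords)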