Пример #1
0
    def probable_languages(self, text, max_languages=3):
        """List the most probable programming languages of a source code.

        The list is ordered from the most probable language to the least
        probable one. It is cut at the largest gap between two consecutive
        log-probabilities and never holds more than ``max_languages`` names.

        :param str text: source code.
        :param int max_languages: maximum number of listed languages.
        :return: languages list
        :rtype: list
        """
        values = extract(text)
        input_fn = _to_func([[values], []])
        proba = next(self._classifier.predict_proba(input_fn=input_fn))
        proba = np.asarray(proba, dtype=float)

        # Order the languages from the most probable to the least probable:
        # probability positions are matched against the sorted language names
        positions = np.argsort(proba)[::-1]
        names = np.sort(list(self.languages))[positions]

        # Find the most distant consecutive languages:
        # A logarithmic scale is used here because the probabilities
        # are most of the time really close to zero.
        # Exact zeros are raised to the smallest positive float so that
        # log() never yields -inf, which would turn the distances
        # into inf/nan values and mislead argmax.
        floor = np.finfo(float).tiny
        log_proba = np.log(np.maximum(proba[positions], floor))
        distances = log_proba[:-1] - log_proba[1:]

        if distances.size:
            max_distance_pos = 1 + int(np.argmax(distances))
        else:
            # Fewer than two languages: there is no gap to measure
            # and argmax would raise on an empty sequence
            max_distance_pos = 1

        # Keep the languages that are close to the most probable one
        nb_languages = min(max_languages, max_distance_pos)
        return names[:nb_languages].tolist()
Пример #2
0
def _extract_features(path, rank_map):
    """Build the ``[features, rank]`` pair of a source file.

    The file extension, stripped of its leading dots, is looked up in
    ``rank_map``. Only the first ``_NB_LINES`` lines of the file content
    are given to ``extract``.

    :param path: file path, must provide a ``suffix`` attribute.
    :param rank_map: mapping that gives a rank for an extension.
    :raise GuesslangError: when ``rank_map`` gives no rank for the extension.
    :return: ``[extract(content), rank]``
    """
    extension = path.suffix.lstrip('.')
    language_rank = rank_map.get(extension)
    if language_rank is None:
        raise GuesslangError(
            'Language not found for ext: {}'.format(extension))

    lines = safe_read_file(path).splitlines()
    head = '\n'.join(lines[:_NB_LINES])
    return [extract(head), language_rank]
def test_extract():
    """Check the extraction of a C snippet against its individual tokens.

    Each sample token, extracted alone, must give exactly one index
    according to ``non_empty_indices``, and that index must be one of
    the indices given for the whole snippet.
    """
    source = """
        int * last(int *tab, int size)
        {
        \treturn tab + (size - 1);
        }
    """
    expected_tokens = (
        'int', '*', 'last', '(', 'tab', ',', 'size', ')', '\n', '{',
    )

    features = extractor.extract(source)
    assert features

    source_positions = non_empty_indices(features)
    for token in expected_tokens:
        token_positions = non_empty_indices(extractor.extract(token))

        assert len(token_positions) == 1
        assert source_positions.issuperset(token_positions)
Пример #4
0
def _extract_features(path: Path,
                      rank_map: Dict[str, int]) -> Tuple[List[float], int]:
    """Compute the features of a source file along with its language rank.

    The rank is the ``rank_map`` value of the file extension (leading dots
    removed). The features are extracted from the first ``NB_LINES`` lines
    of the file content.

    :param path: path of the source file.
    :param rank_map: extension to rank mapping.
    :raise GuesslangError: when ``rank_map`` gives no rank for the extension.
    :return: ``(features, rank)`` tuple
    """
    extension = path.suffix.lstrip('.')
    language_rank = rank_map.get(extension)
    if language_rank is not None:
        first_lines = safe_read_file(path).splitlines()[:NB_LINES]
        features = extract('\n'.join(first_lines))
        return (features, language_rank)

    raise GuesslangError('Language not found for ext: {}'.format(extension))
Пример #5
0
    def language_name(self, text):
        """Return the name of the predicted programming language.

        The classifier gives a position that is used as an index
        into the alphabetically sorted language names.

        ``text`` -- source code.

        """
        features = extract(text)
        predictions = self._classifier.predict_classes(
            input_fn=_to_func([[features], []]))
        position = next(predictions)

        LOGGER.debug("Predicted language position %s", position)
        language_names = sorted(self.languages)
        return language_names[position]
Пример #6
0
    def language_name(self, text: str) -> str:
        """Give the name of the programming language predicted for a text.

        The first class predicted by the classifier is used as an index
        into the sorted language names.

        :param text: source code.
        :return: language name
        """
        features = extract(text)
        predicted_classes = self._classifier.predict_classes(
            input_fn=_to_func(([features], [])))
        position: int = next(predicted_classes)

        LOGGER.debug("Predicted language position %s", position)
        ordered_names = sorted(self.languages)
        return ordered_names[position]
Пример #7
0
    def scores(self, text: str) -> Dict[str, float]:
        """Give each language the probability that the text is written in it.

        The first prediction of the classifier is paired, position by
        position, with the sorted language names.
        The score is a `float` value between 0.0 and 1.0

        :param text: source code.
        :return: language to score dictionary
        """
        features = extract(text)
        predictions = self._classifier.predict_proba(
            input_fn=_to_func(([features], [])))
        language_scores = next(predictions).tolist()
        language_names = sorted(self.languages)
        return {
            name: score
            for name, score in zip(language_names, language_scores)
        }
Пример #8
0
    def probable_languages(self, text):
        """Returns the list of most probable programming languages,
        the list is ordered from the most probable to the least probable.

        A language is listed when its probability is above
        ``max(proba) - _K_STDEV * stdev(proba)``. When no probability is
        above that threshold, the most probable language alone is listed.
        The list is empty only when there are no probabilities at all.

        ``text`` -- source code.

        """
        values = extract(text)
        input_fn = _to_func([[values], []])
        proba = next(self._classifier.predict_proba(input_fn=input_fn))
        proba = proba.tolist()
        if not proba:
            # Nothing to rank: max() would raise on an empty list
            return []

        # stdev() requires at least two data points
        deviation = stdev(proba) if len(proba) > 1 else 0.0
        threshold = max(proba) - _K_STDEV * deviation

        items = sorted(enumerate(proba), key=itemgetter(1), reverse=True)
        LOGGER.debug("Threshold: %f, probabilities: %s", threshold, items)

        positions = [pos for pos, value in items if value > threshold]
        if not positions:
            # The threshold is not below the highest probability
            # (e.g. zero deviation when all the probabilities are equal):
            # fall back to the first item of the descending order
            positions = [items[0][0]]
        LOGGER.debug("Predicted languages positions %s", positions)

        names = sorted(self.languages)
        return [names[pos] for pos in positions]