def probable_languages(self, text, max_languages=3):
    """List the most probable programming languages of a source code.

    The languages are ordered from the most probable to the least
    probable. The list is cut at the largest gap between two consecutive
    log-scaled probabilities and never holds more than ``max_languages``
    names.

    :param str text: source code.
    :param int max_languages: maximum number of listed languages.
    :return: language names, most probable first
    :rtype: numpy.ndarray
    """
    values = extract(text)
    input_fn = _to_func([[values], []])
    proba = next(self._classifier.predict_proba(input_fn=input_fn))
    # Going through ``tolist()`` keeps the computation in Python floats
    # (float64), whatever dtype the classifier produced.
    proba = np.asarray(proba.tolist(), dtype=float)

    # Order the languages from the most probable to the least probable
    positions = np.argsort(proba)[::-1]
    names = np.sort(list(self.languages))[positions]

    # Find the most distant consecutive languages:
    # A logarithmic scale is used here because the probabilities here
    # are most of the time really close to zero.
    # Probabilities that are exactly zero are clipped to the smallest
    # positive float first: log(0) is -inf, and two consecutive -inf
    # values would produce a NaN distance that np.argmax picks as the
    # maximum, keeping languages that have no probability at all.
    smallest = np.finfo(float).tiny
    log_proba = np.log(np.clip(proba[positions], smallest, None))
    distances = log_proba[:-1] - log_proba[1:]

    # With fewer than two languages there is no gap to measure and
    # np.argmax would fail on the empty array: keep the single entry.
    if distances.size:
        max_distance_pos = 1 + int(np.argmax(distances))
    else:
        max_distance_pos = 1

    # Keep the languages that are close to the most probable one
    nb_languages = min(max_languages, max_distance_pos)
    return names[:nb_languages]
def _extract_features(path, rank_map):
    """Build the ``[features, rank]`` pair for one source file.

    ``path`` -- file path, its extension (without the leading dot) is
    looked up in ``rank_map``.
    ``rank_map`` -- mapping from file extension to language rank.

    Only the first ``_NB_LINES`` lines of the file are given to
    ``extract``. Raises ``GuesslangError`` when the extension has no rank
    in ``rank_map``.
    """
    extension = path.suffix.lstrip('.')
    language_rank = rank_map.get(extension)
    if language_rank is None:
        message = 'Language not found for ext: {}'.format(extension)
        raise GuesslangError(message)

    # Truncate the file to its first lines before extracting the features
    lines = safe_read_file(path).splitlines()
    head = '\n'.join(lines[:_NB_LINES])
    return [extract(head), language_rank]
def test_extract():
    """Check that the tokens of a C snippet are found by the extractor.

    Each token, extracted on its own, must fill exactly one index, and
    that index must also be filled when the whole snippet is extracted.
    """
    # Line breaks are spelled out as explicit escapes so the fixture is
    # guaranteed to contain the '\n' token checked below, whatever
    # happens to the whitespace of this file.
    text = (
        "\n"
        "int * last(int *tab, int size)\n"
        "{\n"
        "\treturn tab + (size - 1);\n"
        "}\n"
    )
    tokens = ['int', '*', 'last', '(', 'tab', ',', 'size', ')', '\n', '{']

    values = extractor.extract(text)
    assert values

    text_indices = non_empty_indices(values)
    for token in tokens:
        token_values = extractor.extract(token)
        token_indices = non_empty_indices(token_values)
        # A single token fills a single index...
        assert len(token_indices) == 1
        # ...which is one of the indices filled by the full text
        assert text_indices.issuperset(token_indices)
def _extract_features(path: Path, rank_map: Dict[str, int]) -> Tuple[List[float], int]:
    """Compute the features and the language rank of a source file.

    :param path: source file, its extension (without the leading dot)
        selects the rank.
    :param rank_map: file extension to language rank mapping.
    :raise GuesslangError: the file extension is not in ``rank_map``.
    :return: ``(features, rank)`` where the features are extracted from
        the first ``NB_LINES`` lines of the file.
    """
    suffix = path.suffix.lstrip('.')
    rank = rank_map.get(suffix)
    if rank is not None:
        # Only the beginning of the file is used to compute the features
        first_lines = safe_read_file(path).splitlines()[:NB_LINES]
        return (extract('\n'.join(first_lines)), rank)

    raise GuesslangError('Language not found for ext: {}'.format(suffix))
def language_name(self, text):
    """Return the name of the programming language predicted for a text.

    ``text`` -- source code.
    """
    features = extract(text)
    predictions = self._classifier.predict_classes(
        input_fn=_to_func([[features], []]))

    # The classifier yields a position in the sorted list of languages
    position = next(predictions)
    LOGGER.debug("Predicted language position %s", position)

    ordered_names = sorted(self.languages)
    return ordered_names[position]
def language_name(self, text: str) -> str:
    """Give the name of the programming language predicted for a text.

    :param text: source code.
    :return: language name
    """
    input_fn = _to_func(([extract(text)], []))
    classes = self._classifier.predict_classes(input_fn=input_fn)

    # First prediction: an index into the alphabetically sorted languages
    index: int = next(classes)
    LOGGER.debug("Predicted language position %s", index)

    language_names = sorted(self.languages)
    return language_names[index]
def scores(self, text: str) -> Dict[str, float]:
    """Map each language to the probability that the text is written in it.

    The scores are the probabilities given by the classifier for the
    first prediction; they are paired with the language names sorted
    alphabetically.

    :param text: source code.
    :return: language to score dictionary
    """
    features = extract(text)
    predictions = self._classifier.predict_proba(
        input_fn=_to_func(([features], [])))
    probabilities = next(predictions).tolist()

    return {
        language: score
        for language, score in zip(sorted(self.languages), probabilities)
    }
def probable_languages(self, text):
    """Return the most probable programming languages of a source code.

    The list is ordered from the most probable language to the least
    probable one. A language is kept when its probability is strictly
    above ``max(probabilities) - _K_STDEV * stdev(probabilities)``.

    ``text`` -- source code.
    """
    features = extract(text)
    predictions = self._classifier.predict_proba(
        input_fn=_to_func([[features], []]))
    probabilities = next(predictions).tolist()

    # Cut-off: a number of standard deviations below the best probability
    threshold = max(probabilities) - _K_STDEV * stdev(probabilities)

    # (position, probability) pairs, highest probability first
    ranked = sorted(
        enumerate(probabilities), key=itemgetter(1), reverse=True)
    LOGGER.debug("Threshold: %f, probabilities: %s", threshold, ranked)

    positions = []
    for position, probability in ranked:
        if probability > threshold:
            positions.append(position)
    LOGGER.debug("Predicted languages positions %s", positions)

    # Positions refer to the alphabetically sorted language names
    ordered_names = sorted(self.languages)
    return [ordered_names[position] for position in positions]