def concept_assoc(terms):
    """
    Turn a comma-separated string of English terms into a list of
    (concept URI, weight) pairs, assigning each term a weight of 1.0.
    """
    terms = terms.split(',')
    result = []
    for term in terms:
        uri = concept_uri('en', term)
        result.append((uri, 1.0))
    return result
def concepts(terms):
    """
    Turn a comma-separated string of English terms into a list of
    concept URIs.
    """
    terms = terms.split(',')
    result = []
    for term in terms:
        uri = concept_uri('en', term)
        result.append(uri)
    return result
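# Illustrative sketch, not part of the source: assuming concept_uri() builds
# URIs of the form '/c/<lang>/<term>' (as the doctests further down suggest),
# the two helpers above would behave roughly like this:
#
#     concepts('cat,dog')       -> ['/c/en/cat', '/c/en/dog']
#     concept_assoc('cat,dog')  -> [('/c/en/cat', 1.0), ('/c/en/dog', 1.0)]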
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language, including
    stemming the text if necessary, normalizing it, and joining it into a
    concept URI.

    Items in 'more' will not be stemmed, but will go through the other
    normalization steps.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    """
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    norm_text = standardize_text(text, token_filter)
    more_text = [standardize_text(item, token_filter)
                 for item in more if item is not None]
    return concept_uri(lang, norm_text, *more_text)
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language, including
    stemming the text if necessary, normalizing it, and joining it into a
    concept URI.

    Items in 'more' will not be stemmed, but will go through the other
    normalization steps.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    """
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None
    norm_text = standardize_text(text, token_filter)
    more_text = [standardize_text(item, token_filter)
                 for item in more if item is not None]
    return concept_uri(lang, norm_text, *more_text)
def concepts(terms):
    """
    Turn a comma-separated string of English terms into a list of
    concept URIs.
    """
    terms = terms.split(",")
    result = []
    for term in terms:
        uri = concept_uri("en", term)
        result.append(uri)
    return result
def make_assoc(terms):
    """
    Turn a comma-separated string of English terms into a list of
    (concept URI, weight) pairs, assigning each term a weight of 1.0.
    """
    terms = terms.split(",")
    result = []
    for term in terms:
        uri = concept_uri("en", term)
        result.append((uri, 1.0))
    return result
def make_term():
    """
    Make and return a string representing a (synthetic) term in a randomly-
    chosen language.
    """
    global _term_count
    language = random_gen.choice(LANGUAGES, p=LANGUAGE_PROBA)
    term_text = 'term{}'.format(_term_count)
    term = concept_uri(language, term_text)
    _term_count += 1
    return term
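# Hypothetical module-level setup that make_term() relies on; these names and
# values are illustrative assumptions, not taken from the source. The p=
# keyword argument implies a numpy-style random generator rather than the
# stdlib random module.
import numpy as np

random_gen = np.random.RandomState(0)        # assumed: numpy RandomState
LANGUAGES = ['en', 'fr', 'de', 'es', 'ja']   # assumed language codes
LANGUAGE_PROBA = [0.4, 0.2, 0.2, 0.1, 0.1]   # assumed probabilities (sum to 1)
_term_count = 0                              # counter for unique synthetic terms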
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language, including
    removing English stopwords, normalizing the text in a way appropriate to
    that language (using the text normalization from wordfreq), and joining
    its tokens with underscores in a concept URI.

    This text normalization can smooth over some writing differences: for
    example, it removes vowel points from Arabic words, and it transliterates
    Serbian written in the Cyrillic alphabet to the Latin alphabet so that it
    can match other words written in Latin letters.

    'more' contains information to distinguish word senses, such as a part of
    speech or a WordNet domain. The items in 'more' get lowercased and joined
    with underscores, but skip many of the other steps -- for example, they
    won't have stopwords removed.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    >>> standardized_concept_uri('sh', 'симетрија')
    '/c/sh/simetrija'
    """
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None

    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    if token_filter is not None:
        tokens = token_filter(tokens)
    norm_text = '_'.join(tokens)

    more_text = []
    for item in more:
        if item is not None:
            tokens = simple_tokenize(item.replace('_', ' '))
            if token_filter is not None:
                tokens = token_filter(tokens)
            more_text.append('_'.join(tokens))

    return concept_uri(lang, norm_text, *more_text)
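# Illustrative consequence of the underscore handling above (an assumption, not
# a doctest from the source): because the text goes through
# text.replace('_', ' ') before tokenizing, writing a phrase with underscores
# or with spaces should produce the same concept URI, e.g.
#
#     standardized_concept_uri('en', 'example phrase')  -> '/c/en/example_phrase'
#     standardized_concept_uri('en', 'example_phrase')  -> '/c/en/example_phrase'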
def normalized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language, including
    stemming the text if necessary, normalizing it, and joining it into a
    concept URI.

    Items in 'more' will not be stemmed, but will go through the other
    normalization steps.

    >>> normalized_concept_uri('en', 'this is a test')
    '/c/en/this_be_test'
    >>> normalized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_be_test/n/example_phrase'
    """
    norm_text = normalized_concept_name(lang, text)
    more_text = [normalize_text(item) for item in more]
    return concept_uri(lang, norm_text, *more_text)
def get_concept_uri(text, *more):
    """
    Make an English concept URI from 'text'. Items in 'more' go through
    English-specific standardization before being appended to the URI.
    """
    norm_text = concept_name(text)
    more_text = [english_standardize_text(item)
                 for item in more if item is not None]
    return concept_uri('en', norm_text, *more_text)