def bigrams(self):
    """Return a dictionary { ("w1", "w2"): NUMBER_OF_OCCURRENCES, ... }.

    Each key is a pair of adjacent tokens taken from ``self.tokens()``; the
    pairs are tallied with ``sparse_dict_increment``. A text with fewer than
    two tokens yields an empty dictionary.
    """
    # Fetch the token sequence once; the previous version re-invoked
    # self.tokens() several times on every loop iteration.
    tokens = self.tokens()
    result = {}
    # zip stops at the shorter input, so this visits exactly the
    # len(tokens) - 1 adjacent pairs, in order.
    for bigram in zip(tokens, tokens[1:]):
        sparse_dict_increment(result, bigram)
    return result
def quantify(self, analysis):
    """Quantify usage of cohesive markers.

    For English (``analysis.lang == 'en'``) every entry of
    ``COHESIVE_MARKERS`` is tokenized with ``nltk.word_tokenize`` and matched
    against the token sequence of the text.

    For Chinese (``analysis.lang == 'zh'``) the marker list is chosen by
    ``self.k``:

    * 0 -- ``COHESIVE_MARKERS_ZH``
    * 1 -- ``COHESIVE_MARKERS_CHEN_2006``
    * 2 -- a fixed two-marker list
    * 3 -- top 5 from Chen's list, using GainRatio from weka
    * 4 -- ``ADVERSATIVES`` (adversative markers)

    Returns a dictionary mapping each marker that occurs at least once to
    its number of occurrences divided by the number of tokens in the text.

    Raises:
        ValueError: if the language is 'zh' and ``self.k`` is not in 0..4.
        SystemExit: if ``analysis.lang`` is neither 'en' nor 'zh'.
    """
    result = {}

    if analysis.lang == 'en':
        tokenized_markers = [(marker, nltk.word_tokenize(marker))
                             for marker in COHESIVE_MARKERS]
        text = analysis.tokens()
        for i in range(len(text)):
            for marker, tokenized in tokenized_markers:
                if tokenized == text[i:i + len(tokenized)]:
                    sparse_dict_increment(result, marker)

    elif analysis.lang == 'zh':
        if self.k == 0:
            markers = COHESIVE_MARKERS_ZH
        elif self.k == 1:
            markers = COHESIVE_MARKERS_CHEN_2006
        elif self.k == 2:
            markers = ["即", "也就是说"]
        elif self.k == 3:
            # top 5 from Chen's list, using GainRatio from weka.
            markers = ["但是", "因为", "据说", "那么", "如果"]
        elif self.k == 4:
            # adversative markers
            markers = ADVERSATIVES
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on `markers`.
            raise ValueError(
                'unsupported cohesive marker variant: {!r}'.format(self.k))

        text = analysis.tokens()
        for i in range(len(text)):
            remaining = len(text) - i
            for marker in markers:
                # A marker may have been segmented into several tokens; we
                # assume at most 3 parts (and never more parts than it has
                # characters). Capping by `remaining` keeps slices from
                # being silently truncated at the end of the text, which
                # used to make a trailing marker match for several j values.
                max_parts = min(len(marker), 3, remaining)
                for j in range(1, max_parts + 1):
                    if marker == ''.join(text[i:i + j]):
                        sparse_dict_increment(result, marker)
                        # Count each start position at most once.
                        break

    else:
        # SystemExit is what exit() raised before; raising it directly does
        # not depend on the `site` helper and reports a failing status.
        raise SystemExit(
            'language "{}" not implemented yet for cohesive_markers'.format(
                analysis.lang))

    # `result` is empty when `text` is empty, so no division by zero occurs.
    return {marker: float(count) / len(text)
            for marker, count in result.items()}
def __add_token_edges(self, token):
    """Record the leading and trailing ``self.k`` characters of *token*.

    The prefix is counted under ``WORD_START + prefix`` and the suffix under
    ``suffix + WORD_END`` in ``self.histogram``. Tokens shorter than
    ``self.k`` are ignored.
    """
    k = self.k
    if len(token) < k:
        return

    word_start = WORD_START + token[:k]
    # token[-k:] would return the WHOLE token when k == 0 (because -0 == 0),
    # so compute the start index explicitly to get an empty suffix instead,
    # mirroring the empty prefix above.
    word_end = token[len(token) - k:] + WORD_END

    for key in (word_start, word_end):
        sparse_dict_increment(self.histogram, key)
def histogram(self):
    """Count how often each token occurs in the text.

    >>> Analysis("Hello, hello world.").histogram() == {
    ...     'hello': 2, ',': 1, 'world': 1, '.': 1}
    True
    """
    counts = {}
    for token in self.tokens():
        sparse_dict_increment(counts, token)
    return counts
def quantify_variant(analysis, variant):
    """Count POS-tag n-grams of size ``variant + 1``.

    The tags are taken from the second element of each pair returned by
    ``analysis.pos_tags()``. Keys of the returned dictionary are the
    n-grams passed through ``output_filter_ngram``; values are raw counts.
    """
    size = variant + 1
    tags = [tag for _, tag in analysis.pos_tags()]

    counts = {}
    for gram in ingrams(tags, size):
        sparse_dict_increment(counts, gram)

    return {output_filter_ngram(gram): count
            for gram, count in counts.items()}
def bigrams(self):
    """Returns a histogram of bigrams in the text.

    A bigram is a pair of adjacent tokens from ``self.tokens()``. Texts with
    fewer than two tokens produce an empty histogram.

    >>> Analysis("Hello hello hello world").bigrams() == {
    ...     ('hello', 'hello'): 2, ('hello', 'world'): 1}
    True
    """
    # Call self.tokens() a single time rather than on every iteration.
    tokens = self.tokens()
    result = {}
    # Pair each token with its successor; zip ends after the last full pair.
    for bigram in zip(tokens, tokens[1:]):
        sparse_dict_increment(result, bigram)
    return result
def quantify(analysis):
    """Quantify trigrams of function words and POS tags.

    Each ``(token, tag)`` pair from ``analysis.pos_tags()`` is mapped through
    ``function_word_or_POS``; every trigram of the resulting stream that
    satisfies ``trigram_is_functional`` is counted.

    Returns a dictionary whose keys are the counted trigrams passed through
    ``output_filter_ngram`` and whose values are the counts divided by the
    number of tagged tokens.
    """
    # Tag once and reuse the result for both the stream and the normalizer.
    tagged = analysis.pos_tags()
    num_tokens = float(len(tagged))
    word_stream = (function_word_or_POS(token, tag) for (token, tag) in tagged)

    d = {}
    # nltk.trigrams accepts any iterable and, unlike nltk.util.itrigrams,
    # is still available in NLTK 3.
    for trigram in nltk.trigrams(word_stream):
        if trigram_is_functional(trigram):
            sparse_dict_increment(d, trigram)

    # `d` is empty when there are no tokens, so this never divides by zero.
    return {output_filter_ngram(k): (v / num_tokens) for (k, v) in d.items()}
def quantify_variant(analysis, variant):
    """Count word n-grams of size ``variant + 1``.

    Words are the first element of each pair returned by
    ``analysis.pos_tags()``. The result maps each n-gram, passed through
    ``output_filter_ngram``, to its unnormalized count.
    """
    size = variant + 1
    words = [pair[0] for pair in analysis.pos_tags()]

    tally = {}
    for gram in ngrams(words, size):
        sparse_dict_increment(tally, gram)

    # Raw counts: no normalization by text length is applied here.
    return dict(
        (output_filter_ngram(gram), count) for gram, count in tally.items()
    )
def quantify(analysis):
    """Measure how often each cohesive marker occurs in the text.

    Every entry of ``COHESIVE_MARKERS`` is tokenized with
    ``nltk.word_tokenize`` and compared against each window of the text's
    tokens. The result maps each marker found to its number of matches
    divided by the total number of tokens.
    """
    hits = {}
    marker_forms = [(name, nltk.word_tokenize(name))
                    for name in COHESIVE_MARKERS]
    tokens = analysis.tokens()

    for start in range(len(tokens)):
        for name, parts in marker_forms:
            window = tokens[start:start + len(parts)]
            if parts == window:
                sparse_dict_increment(hits, name)

    return {name: float(hits[name]) / len(tokens) for name in hits}
def quantify(analysis):
    """Count the tokens found at the named positions of each sentence.

    For every sentence with at least 6 tokens, each ``(name, index)`` entry
    of ``POSITION_NAMES`` contributes one count under the key
    ``"<name> <token at index>"``. Shorter sentences are skipped.
    """
    assert isinstance(analysis, translationese.Analysis)

    counts = {}
    # Sentences with fewer than 5 tokens (and a period) are ignored.
    long_enough = (
        tokens for tokens in analysis.tokenized_sentences()
        if len(tokens) >= 6
    )
    for sentence in long_enough:
        for position_name, position in POSITION_NAMES.items():
            token = sentence[position]
            sparse_dict_increment(counts, "%s %s" % (position_name, token))
    return counts
def quantify(analysis):
    """Quantify usage of cohesive markers.

    Returns a dictionary from each marker of ``COHESIVE_MARKERS`` that
    appears in the text to its match count divided by the number of tokens.
    A marker matches at a position when its ``nltk.word_tokenize`` form
    equals the slice of text tokens starting there.
    """
    occurrences = {}
    candidates = []
    for marker in COHESIVE_MARKERS:
        candidates.append((marker, nltk.word_tokenize(marker)))

    text = analysis.tokens()
    for offset, _ in enumerate(text):
        for marker, pieces in candidates:
            end = offset + len(pieces)
            if pieces == text[offset:end]:
                sparse_dict_increment(occurrences, marker)

    return dict(
        (marker, float(occurrences[marker]) / len(text))
        for marker in occurrences.keys()
    )
def quantify(analysis):
    """Quantify contextual function words.

    The function-word list is selected by ``analysis.lang`` ('en' or 'zh').
    Each ``(token, tag)`` pair from ``analysis.pos_tags()`` is mapped through
    ``function_word_or_POS``; every trigram of that stream accepted by
    ``trigram_is_functional`` is counted.

    Returns a dictionary whose keys are the counted trigrams passed through
    ``output_filter_ngram`` and whose values are the counts divided by the
    number of tagged tokens.

    Raises:
        SystemExit: if ``analysis.lang`` is neither 'en' nor 'zh'.
    """
    if analysis.lang == 'en':
        from translationese.function_words import FUNCTION_WORDS
    elif analysis.lang == 'zh':
        from translationese.function_words import FUNCTION_WORDS_ZH as FUNCTION_WORDS
    else:
        # SystemExit is what exit() raised before; raising it directly does
        # not depend on the `site` helper and reports a failing status.
        raise SystemExit(
            'language "{}" not implemented yet for contextual_function_words'
            .format(analysis.lang))

    # Tag once and reuse the result for both the stream and the normalizer.
    tagged = analysis.pos_tags()
    num_tokens = float(len(tagged))
    word_stream = (function_word_or_POS(token, tag, FUNCTION_WORDS)
                   for (token, tag) in tagged)

    d = {}
    for trigram in nltk.trigrams(word_stream):
        if trigram_is_functional(trigram, FUNCTION_WORDS):
            sparse_dict_increment(d, trigram)

    # `d` is empty when there are no tokens, so this never divides by zero.
    return {output_filter_ngram(k): (v / num_tokens) for (k, v) in d.items()}
def quantify_variant(analysis, variant):
    """Quantify POS n-grams.

    * ``variant <= 2``: count POS n-grams of size ``variant + 1``.
    * ``variant == 3``: count POS trigrams, keeping only those whose
      ``output_filter_ngram`` form is in ``MY_POS``.
    * any other variant: return ``None`` (unchanged from the previous
      implicit fall-through).

    The returned dictionary maps the ``output_filter_ngram`` form of each
    n-gram to its unnormalized count.
    """
    if variant <= 2:
        n = variant + 1
    elif variant == 3:
        n = 3
    else:
        return None

    all_pos_tags = [pos for (_, pos) in analysis.pos_tags()]
    d = {}
    for ngram in ngrams(all_pos_tags, n):
        sparse_dict_increment(d, ngram)

    # Convert each key once; variant 3 then filters on the converted name
    # instead of calling output_filter_ngram a second time per key.
    named = {output_filter_ngram(k): v for (k, v) in d.items()}
    if variant == 3:
        named = {name: v for (name, v) in named.items() if name in MY_POS}
    return named  # unnormalized counts
def __add_token_ngrams(self, token):
    """Record every n-gram of length ``self.k + 1`` found in *token*.

    Each n-gram produced by ``ingrams`` is joined into a single string and
    counted in ``self.histogram``.
    """
    size = self.k + 1
    for pieces in ingrams(token, size):
        joined = ''.join(pieces)
        sparse_dict_increment(self.histogram, joined)
def histogram(self):
    """Build the token frequency table of the text.

    The result has the shape ``{"TOKEN": NUMBER_OF_OCCURRENCES, ...}`` and is
    filled by tallying every item of ``self.tokens()``.
    """
    frequencies = {}
    all_tokens = self.tokens()
    for tok in all_tokens:
        sparse_dict_increment(frequencies, tok)
    return frequencies