def __init__(self, passage, lemma_passage, name=None):
    """
    Create a Text object from a raw passage and its lemmatized form.

    :param passage: The source text; a str (or, on Python 2, a byte
        string that will be decoded as UTF-8 on failure).
    :param lemma_passage: Lemmatized version of the passage; tokenized
        into both sentences and word tokens.
    :param name: Optional display name.  If omitted, it is derived from
        the leading tokens of ``lemma_passage``.
    """
    # Collapse newlines so the sentence splitter sees one running line.
    passage = passage.replace("\n", " ")
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    try:
        sentences = sent_detector.tokenize(passage.strip())
    except UnicodeDecodeError:
        # Byte-string input (Python 2): decode to unicode and retry.
        # The original bare `except:` hid every other failure here.
        passage = passage.decode('utf-8')
        sentences = sent_detector.tokenize(passage.strip())
    lemma_sentences = sent_detector.tokenize(lemma_passage.strip())
    tokens = nltk.word_tokenize(lemma_passage)
    if self._COPY_TOKENS:
        tokens = list(tokens)
    self.tokens = tokens
    self.sentences = sentences
    self.lemma_sentences = lemma_sentences
    if name:
        self.name = name
    elif ']' in tokens[:20]:
        # Text opens with a bracketed title; use its contents as the name.
        end = tokens[:20].index(']')
        self.name = " ".join(text_type(tok) for tok in tokens[1:end])
    else:
        self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."
def __init__(self, passage, lemma_passage, name=None):
    """
    Create a Text object from a raw passage and its lemmatized form.

    :param passage: The source text; a str (or, on Python 2, a byte
        string that will be decoded as UTF-8 on failure).
    :param lemma_passage: Lemmatized version of the passage; tokenized
        into both sentences and word tokens.
    :param name: Optional display name.  If omitted, it is derived from
        the leading tokens of ``lemma_passage``.
    """
    # Collapse newlines so the sentence splitter sees one running line.
    passage = passage.replace("\n", " ")
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    try:
        sentences = sent_detector.tokenize(passage.strip())
    except UnicodeDecodeError:
        # Byte-string input (Python 2): decode to unicode and retry.
        # The original bare `except:` hid every other failure here.
        passage = passage.decode('utf-8')
        sentences = sent_detector.tokenize(passage.strip())
    lemma_sentences = sent_detector.tokenize(lemma_passage.strip())
    tokens = nltk.word_tokenize(lemma_passage)
    if self._COPY_TOKENS:
        tokens = list(tokens)
    self.tokens = tokens
    self.sentences = sentences
    self.lemma_sentences = lemma_sentences
    if name:
        self.name = name
    elif ']' in tokens[:20]:
        # Text opens with a bracketed title; use its contents as the name.
        end = tokens[:20].index(']')
        self.name = " ".join(text_type(tok) for tok in tokens[1:end])
    else:
        self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."
def __repr__(self):
    """
    Return a string representation for this corpus view that is
    similar to a list's representation; but if it would be more than
    60 characters long, it is truncated.
    """
    pieces = []
    length = 5  # room for the surrounding brackets / ellipsis
    for elt in self:
        pieces.append(repr(elt))
        length += len(pieces[-1]) + 2
        # Return as soon as the repr exceeds the limit; this avoids
        # walking the whole (possibly lazy/huge) view only to trim a
        # single trailing element afterwards.
        if length > self._MAX_REPR_SIZE and len(pieces) > 2:
            return '[%s, ...]' % text_type(', ').join(pieces[:-1])
    return '[%s]' % text_type(', ').join(pieces)
def plot_word_discontinuous_distortion(): ''' word discontinuous distortion :return: ''' phrase_discont_distance_lr_l = pickle.load(open('word_discont_distance_lr_l.p','rb')) phrase_discont_distance_lr_r = pickle.load(open('word_discont_distance_lr_r.p','rb')) phrase_discont_distance_rl_l = pickle.load(open('word_discont_distance_rl_l.p','rb')) phrase_discont_distance_rl_r = pickle.load(open('word_discont_distance_rl_r.p','rb')) means = [np.mean(phrase_discont_distance_lr_l), np.mean(phrase_discont_distance_lr_r), np.mean(phrase_discont_distance_rl_l), np.mean(phrase_discont_distance_rl_r)] stds = [np.std(phrase_discont_distance_lr_l), np.std(phrase_discont_distance_lr_r), np.std(phrase_discont_distance_rl_l), np.std(phrase_discont_distance_rl_r)] print 'Means: ', means print 'Stds: ', stds labels = ['Left2Right\nLeft', 'Left2Right\nRight', 'Right2Left\nLeft', 'Right2Left\nRight'] plt.errorbar(np.array(range(len(means))) + .9, means, stds, marker='o', linestyle='None', \ ecolor='#5f9ed1', mfc='#5f9ed1', mec='None', label='Young') # plt.axhline(y=0, color='grey', linestyle='--', alpha=0.5) plt.xticks(np.array(range(len(means))) + .9, [compat.text_type(s.replace(' ', '\n')) for s in labels]) plt.yticks(range(1,21), [str(i) for i in range(1,21)]) plt.xlabel('Word discontinuous events') plt.ylabel('Distortion') plt.ylim((1,20)) plt.savefig('word_discont_dist', dpi=100, bbox_inches='tight') plt.close() return True
def __init__(self, tokens, name=None):
    """
    Create a Text object.

    :param tokens: The source text.
    :type tokens: sequence of str
    """
    if self._COPY_TOKENS:
        tokens = list(tokens)
    self.tokens = tokens

    if name:
        self.name = name
        return

    head = tokens[:20]
    if ']' in head:
        # The text opens with a bracketed title; name it after that span.
        self.name = " ".join(text_type(t) for t in tokens[1:head.index(']')])
    else:
        # Otherwise fall back to the first few tokens plus an ellipsis.
        self.name = " ".join(text_type(t) for t in tokens[:8]) + "..."
def guess_encoding(data):
    """
    Given a byte string, attempt to decode it.

    Tries 'utf-8' first, then any encodings suggested by the current
    locale, and finally 'latin-1'.  The calling program *must* first
    call::

        locale.setlocale(locale.LC_ALL, '')

    :return: ``(decoded_unicode, successful_encoding)`` on success.
    :raises UnicodeError: if none of the candidate encodings work.
    """
    # Candidate encodings, highest priority first.
    candidates = ["utf-8"]
    # Locale lookups may be unavailable on some platforms; collect what we can.
    try:
        candidates.append(locale.nl_langinfo(locale.CODESET))
    except AttributeError:
        pass
    try:
        candidates.append(locale.getlocale()[1])
    except (AttributeError, IndexError):
        pass
    try:
        candidates.append(locale.getdefaultlocale()[1])
    except (AttributeError, IndexError):
        pass
    candidates.append("latin-1")

    for enc in candidates:
        if not enc:
            # Some locale calls return None -- skip those entries.
            continue
        try:
            return (text_type(data, enc), enc)
        except (UnicodeError, LookupError):
            pass

    raise UnicodeError(
        "Unable to decode input data. Tried the following encodings: %s."
        % ", ".join([repr(e) for e in candidates if e])
    )
def plot_phrase_discontinuous_distortion(): ''' phrase discontinuous distortion :return: ''' phrase_discont_distance_lr_l = pickle.load( open('phrase_discont_distance_lr_l.p', 'rb')) phrase_discont_distance_lr_r = pickle.load( open('phrase_discont_distance_lr_r.p', 'rb')) phrase_discont_distance_rl_l = pickle.load( open('phrase_discont_distance_rl_l.p', 'rb')) phrase_discont_distance_rl_r = pickle.load( open('phrase_discont_distance_rl_r.p', 'rb')) means = [ np.mean(phrase_discont_distance_lr_l), np.mean(phrase_discont_distance_lr_r), np.mean(phrase_discont_distance_rl_l), np.mean(phrase_discont_distance_rl_r) ] stds = [ np.std(phrase_discont_distance_lr_l), np.std(phrase_discont_distance_lr_r), np.std(phrase_discont_distance_rl_l), np.std(phrase_discont_distance_rl_r) ] print 'Means: ', means print 'Stds: ', stds labels = [ 'Left2Right\nLeft', 'Left2Right\nRight', 'Right2Left\nLeft', 'Right2Left\nRight' ] plt.errorbar(np.array(range(len(means))) + .9, means, stds, marker='o', linestyle='None', \ ecolor='#5f9ed1', mfc='#5f9ed1', mec='None', label='Young') # plt.axhline(y=0, color='grey', linestyle='--', alpha=0.5) plt.xticks( np.array(range(len(means))) + .9, [compat.text_type(s.replace(' ', '\n')) for s in labels]) plt.yticks(range(1, 21), [str(i) for i in range(1, 21)]) plt.xlabel('Phrase discontinuous events') plt.ylabel('Distortion') plt.ylim((1, 20)) plt.savefig('phrase_discont_dist', dpi=100, bbox_inches='tight') plt.close() return True
def guess_encoding(data):
    """
    Given a byte string, attempt to decode it.

    Candidate encodings are tried in order: 'utf-8', then whatever the
    locale suggests, then 'latin-1'.  The calling program *must* first
    call::

        locale.setlocale(locale.LC_ALL, '')

    :return: ``(decoded_unicode, successful_encoding)`` on success.
    :raises UnicodeError: if every candidate encoding fails.
    """
    encodings = ['utf-8']
    # Augment with locale-derived encodings where the platform allows.
    try:
        encodings.append(locale.nl_langinfo(locale.CODESET))
    except AttributeError:
        pass
    try:
        encodings.append(locale.getlocale()[1])
    except (AttributeError, IndexError):
        pass
    try:
        encodings.append(locale.getdefaultlocale()[1])
    except (AttributeError, IndexError):
        pass
    encodings.append('latin-1')

    decoded = None
    winner = None
    # Locale calls may have produced None entries; filter as we go.
    for candidate in encodings:
        if not candidate:
            continue
        try:
            decoded = text_type(data, candidate)
        except (UnicodeError, LookupError):
            continue
        winner = candidate
        break

    if winner is None:
        raise UnicodeError(
            'Unable to decode input data. Tried the following encodings: %s.'
            % ', '.join([repr(e) for e in encodings if e]))
    return (decoded, winner)
def plot_word_discontinuous_distance(): # word discontinuous distance phrase_discont_dist_LR_l = pickle.load( open('../pickled/word_discont_dist_LR_l.pickle', 'rb')) phrase_discont_dist_LR_r = pickle.load( open('../pickled/word_discont_dist_LR_r.pickle', 'rb')) phrase_discont_dist_RL_l = pickle.load( open('../pickled/word_discont_dist_RL_l.pickle', 'rb')) phrase_discont_dist_RL_r = pickle.load( open('../pickled/word_discont_dist_RL_r.pickle', 'rb')) means = [ np.mean(phrase_discont_dist_LR_l), np.mean(phrase_discont_dist_LR_r), np.mean(phrase_discont_dist_RL_l), np.mean(phrase_discont_dist_RL_r) ] stds = [ np.std(phrase_discont_dist_LR_l), np.std(phrase_discont_dist_LR_r), np.std(phrase_discont_dist_RL_l), np.std(phrase_discont_dist_RL_r) ] print 'Means: ', means print 'Stds: ', stds labels = [ 'Left2Right\nLeft', 'Left2Right\nRight', 'Right2Left\nLeft', 'Right2Left\nRight' ] plt.errorbar(np.array(range(len(means))) + .9, means, stds, marker='o', linestyle='None', \ ecolor='#5f9ed1', mfc='#5f9ed1', mec='None', label='Young') plt.xticks( np.array(range(len(means))) + .9, [compat.text_type(s.replace(' ', '\n')) for s in labels]) plt.yticks(range(1, 21), [str(i) for i in range(1, 21)]) plt.xlabel('Word discontinuous events') plt.ylabel('Distance') plt.ylim((1, 20)) plt.savefig('../fig/word_discont_dist', dpi=100, bbox_inches='tight') plt.close() return True
def _tgrep_node_literal_value(node):
    '''
    Gets the string value of a given parse tree node, for comparison
    using the tgrep node literal predicates.
    '''
    # Tree nodes expose their label; leaves are coerced to text.
    if _istree(node):
        return node.label()
    return text_type(node)