示例#1
0
文件: text.py 项目: Olivia-J/corpus
    def __init__(self, passage, lemma_passage, name=None):
        """
        Create a Text object from a raw passage and its lemmatized form.

        :param passage: The source text.  It is split into sentences,
            which are stored as ``self.sentences``.
        :type passage: str (a UTF-8 byte string is decoded if the
            tokenizer rejects it)
        :param lemma_passage: The lemmatized text.  It is split into
            sentences (``self.lemma_sentences``) and word-tokenized
            (``self.tokens``).
        :type lemma_passage: str
        :param name: Optional display name.  When falsy, the name is
            built from the leading tokens of ``lemma_passage``.
        :type name: str or None
        """
        # Replace newlines with spaces before sentence splitting.
        passage = passage.replace("\n", " ")
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        try:
            sentences = sent_detector.tokenize(passage.strip())
        except Exception:
            # Retrying after a UTF-8 decode only makes sense for a byte
            # string.  For anything else, re-raise the tokenizer's own error
            # instead of masking it with a failure from ``.decode``.
            if not isinstance(passage, bytes):
                raise
            passage = passage.decode('utf-8')
            sentences = sent_detector.tokenize(passage.strip())
        lemma_sentences = sent_detector.tokenize(lemma_passage.strip())
        tokens = nltk.word_tokenize(lemma_passage)
        if self._COPY_TOKENS:
            tokens = list(tokens)
        self.tokens = tokens

        self.sentences = sentences
        self.lemma_sentences = lemma_sentences
        if name:
            self.name = name
        elif ']' in tokens[:20]:
            # A ']' among the first 20 tokens: use the tokens from index 1
            # up to (not including) that ']' as the name.
            end = tokens[:20].index(']')
            self.name = " ".join(text_type(tok) for tok in tokens[1:end])
        else:
            # Otherwise fall back to the first eight tokens plus an ellipsis.
            self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."
示例#2
0
文件: text.py 项目: Olivia-J/corpus
    def __init__(self, passage, lemma_passage, name=None):
        """
        Create a Text object from a raw passage and its lemmatized form.

        :param passage: The source text.  It is split into sentences,
            which are stored as ``self.sentences``.
        :type passage: str (a UTF-8 byte string is decoded if the
            tokenizer rejects it)
        :param lemma_passage: The lemmatized text.  It is split into
            sentences (``self.lemma_sentences``) and word-tokenized
            (``self.tokens``).
        :type lemma_passage: str
        :param name: Optional display name.  When falsy, the name is
            built from the leading tokens of ``lemma_passage``.
        :type name: str or None
        """
        # Replace newlines with spaces before sentence splitting.
        passage = passage.replace("\n", " ")
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        try:
            sentences = sent_detector.tokenize(passage.strip())
        except Exception:
            # Only a byte string can be rescued by decoding it as UTF-8.
            # Any other input gets the tokenizer's original exception
            # re-raised rather than hidden behind a ``.decode`` failure.
            if not isinstance(passage, bytes):
                raise
            passage = passage.decode('utf-8')
            sentences = sent_detector.tokenize(passage.strip())
        lemma_sentences = sent_detector.tokenize(lemma_passage.strip())
        tokens = nltk.word_tokenize(lemma_passage)
        if self._COPY_TOKENS:
            tokens = list(tokens)
        self.tokens = tokens

        self.sentences = sentences
        self.lemma_sentences = lemma_sentences
        if name:
            self.name = name
        elif ']' in tokens[:20]:
            # Name is the run of tokens from index 1 up to the first ']'
            # found within the first 20 tokens.
            end = tokens[:20].index(']')
            self.name = " ".join(text_type(tok) for tok in tokens[1:end])
        else:
            # No ']' near the start: use the first eight tokens plus "...".
            self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."
示例#3
0
 def __repr__(self):
     """
     Return a list-like string representation of this corpus view.

     Elements are rendered with ``repr``.  Once the running length
     exceeds ``self._MAX_REPR_SIZE`` and more than two elements have
     been rendered, the last rendered element is dropped and the
     output ends in ``...``.
     """
     separator = text_type(', ')
     rendered = []
     # Running length estimate; starts at 5 and grows by the element's
     # repr length plus 2 for each element.
     running_length = 5
     for item in self:
         text = repr(item)
         rendered.append(text)
         running_length += len(text) + 2
         over_budget = running_length > self._MAX_REPR_SIZE
         if over_budget and len(rendered) > 2:
             return '[%s, ...]' % separator.join(rendered[:-1])
     # Iteration finished without truncating: show every element.
     return '[%s]' % separator.join(rendered)
示例#4
0
 def __repr__(self):
     """
     Build a representation of this corpus view that looks like a
     list's, truncated with ``...`` when it grows past
     ``self._MAX_REPR_SIZE`` (only after more than two elements have
     been collected).
     """
     parts = []
     # Length estimate: starts at 5, each element adds len(repr) + 2.
     used = 5
     for element in self:
         parts.append(repr(element))
         used = used + len(parts[-1]) + 2
         if len(parts) <= 2:
             # Never truncate before more than two elements are collected.
             continue
         if used > self._MAX_REPR_SIZE:
             # Drop the element that pushed the length over the limit.
             return '[%s, ...]' % text_type(', ').join(parts[:-1])
     return '[%s]' % text_type(', ').join(parts)
def plot_word_discontinuous_distortion():
    '''
    Plot word discontinuous distortion.

    Loads four pickled distance collections (paths relative to the current
    working directory), prints their means and standard deviations, draws
    them as an error-bar plot and saves the figure as ``word_discont_dist``.

    :return: True
    '''
    pickle_paths = [
        'word_discont_distance_lr_l.p',
        'word_discont_distance_lr_r.p',
        'word_discont_distance_rl_l.p',
        'word_discont_distance_rl_r.p',
    ]
    # NOTE(review): pickle.load can execute arbitrary code; only feed it
    # files produced by a trusted run.
    word_distances = []
    for path in pickle_paths:
        # The context manager closes each file handle as soon as it is read.
        with open(path, 'rb') as handle:
            word_distances.append(pickle.load(handle))

    means = [np.mean(values) for values in word_distances]
    stds = [np.std(values) for values in word_distances]

    # A parenthesised single argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('Means:  %s' % (means,))
    print('Stds:  %s' % (stds,))

    labels = ['Left2Right\nLeft', 'Left2Right\nRight', 'Right2Left\nLeft', 'Right2Left\nRight']

    # x positions 0.9, 1.9, ... shared by the markers and the tick labels.
    positions = np.arange(len(means)) + .9

    plt.errorbar(positions, means, stds, marker='o', linestyle='None',
                 ecolor='#5f9ed1', mfc='#5f9ed1', mec='None', label='Young')
    plt.xticks(positions, [compat.text_type(s.replace(' ', '\n')) for s in labels])
    plt.yticks(range(1, 21), [str(i) for i in range(1, 21)])
    plt.xlabel('Word discontinuous events')
    plt.ylabel('Distortion')

    plt.ylim((1, 20))

    plt.savefig('word_discont_dist', dpi=100, bbox_inches='tight')
    plt.close()

    return True
示例#6
0
    def __init__(self, tokens, name=None):
        """
        Build a Text object around a sequence of tokens.

        :param tokens: The source text.
        :type tokens: sequence of str
        :param name: Optional display name; when falsy, a name is
            derived from the leading tokens.
        """
        # Store a private list copy when the class asks for one.
        token_seq = list(tokens) if self._COPY_TOKENS else tokens
        self.tokens = token_seq

        if name:
            self.name = name
            return

        head = token_seq[:20]
        if ']' in head:
            # Name is the tokens from index 1 up to the first ']'.
            closing = head.index(']')
            title_tokens = token_seq[1:closing]
            suffix = ""
        else:
            # Otherwise the first eight tokens followed by an ellipsis.
            title_tokens = token_seq[:8]
            suffix = "..."
        self.name = " ".join(text_type(tok) for tok in title_tokens) + suffix
示例#7
0
    def __init__(self, tokens, name=None):
        """
        Initialise the Text with its tokens and a display name.

        :param tokens: The source text.
        :type tokens: sequence of str
        :param name: Display name to use.  If falsy, one is generated
            from the first tokens.
        """
        self.tokens = tokens = (
            list(tokens) if self._COPY_TOKENS else tokens
        )

        if not name:
            leading = tokens[:20]
            if ']' in leading:
                # Use the tokens between index 1 and the first ']'.
                stop = leading.index(']')
                name = " ".join(map(text_type, tokens[1:stop]))
            else:
                # No ']' near the start: first eight tokens plus "...".
                name = " ".join(map(text_type, tokens[:8])) + "..."
        self.name = name
示例#8
0
def guess_encoding(data):
    """
    Decode a byte string by trying a series of candidate encodings.

    Candidates are tried in this order: 'utf-8', the encodings reported
    by the locale module, and finally 'latin-1'.

    The calling program *must* first call::

        locale.setlocale(locale.LC_ALL, '')

    :return: ``(decoded_unicode, successful_encoding)`` for the first
        candidate that decodes ``data``.
    :raise UnicodeError: if none of the candidates decodes ``data``.
    """
    # 'utf-8' always goes first.
    candidates = ["utf-8"]

    # Each locale probe is paired with the exceptions it is allowed to
    # raise; a failing probe simply contributes no candidate.
    locale_probes = (
        (lambda: locale.nl_langinfo(locale.CODESET), AttributeError),
        (lambda: locale.getlocale()[1], (AttributeError, IndexError)),
        (lambda: locale.getdefaultlocale()[1], (AttributeError, IndexError)),
    )
    for probe, tolerated in locale_probes:
        try:
            candidates.append(probe())
        except tolerated:
            pass

    # 'latin-1' is the last resort.
    candidates.append("latin-1")

    for enc in candidates:
        # Some of the locale calls may have returned None.
        if not enc:
            continue
        try:
            decoded = text_type(data, enc)
        except (UnicodeError, LookupError):
            continue
        return (decoded, enc)

    raise UnicodeError(
        "Unable to decode input data.  Tried the following encodings: %s."
        % ", ".join([repr(enc) for enc in candidates if enc])
    )
示例#9
0
def plot_phrase_discontinuous_distortion():
    '''
    Plot phrase discontinuous distortion.

    Loads four pickled distance collections (paths relative to the current
    working directory), prints their means and standard deviations, draws
    them as an error-bar plot and saves the figure as
    ``phrase_discont_dist``.

    :return: True
    '''
    pickle_paths = [
        'phrase_discont_distance_lr_l.p',
        'phrase_discont_distance_lr_r.p',
        'phrase_discont_distance_rl_l.p',
        'phrase_discont_distance_rl_r.p',
    ]
    # NOTE(review): pickle.load can execute arbitrary code; only feed it
    # files produced by a trusted run.
    phrase_distances = []
    for path in pickle_paths:
        # The context manager closes each file handle as soon as it is read.
        with open(path, 'rb') as handle:
            phrase_distances.append(pickle.load(handle))

    means = [np.mean(values) for values in phrase_distances]
    stds = [np.std(values) for values in phrase_distances]

    # A parenthesised single argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('Means:  %s' % (means,))
    print('Stds:  %s' % (stds,))

    labels = [
        'Left2Right\nLeft', 'Left2Right\nRight', 'Right2Left\nLeft',
        'Right2Left\nRight'
    ]

    # x positions 0.9, 1.9, ... shared by the markers and the tick labels.
    positions = np.arange(len(means)) + .9

    plt.errorbar(positions, means, stds, marker='o', linestyle='None',
                 ecolor='#5f9ed1', mfc='#5f9ed1', mec='None', label='Young')
    plt.xticks(
        positions,
        [compat.text_type(s.replace(' ', '\n')) for s in labels])
    plt.yticks(range(1, 21), [str(i) for i in range(1, 21)])
    plt.xlabel('Phrase discontinuous events')
    plt.ylabel('Distortion')

    plt.ylim((1, 20))

    plt.savefig('phrase_discont_dist', dpi=100, bbox_inches='tight')
    plt.close()

    return True
示例#10
0
文件: util.py 项目: aczapata/twitter
def guess_encoding(data):
    """
    Attempt to decode a byte string, returning the text together with
    the encoding that worked.

    The encodings are tried in order: 'utf-8' first, then those
    gathered from locale information, and 'latin-1' last.

    The calling program *must* first call::

        locale.setlocale(locale.LC_ALL, '')

    If successful it returns ``(decoded_unicode, successful_encoding)``.
    If unsuccessful it raises a ``UnicodeError``.
    """
    candidates = ['utf-8']

    # Ask the locale module for further candidates; a missing function or
    # attribute just means that source contributes nothing.
    try:
        candidates.append(locale.nl_langinfo(locale.CODESET))
    except AttributeError:
        pass
    for query_name in ('getlocale', 'getdefaultlocale'):
        try:
            # Look the function up inside the try so that an absent
            # attribute is tolerated as well.
            candidates.append(getattr(locale, query_name)()[1])
        except (AttributeError, IndexError):
            pass

    candidates.append('latin-1')

    # The locale calls may have returned None; drop the empty entries.
    usable = [enc for enc in candidates if enc]
    for enc in usable:
        try:
            return (text_type(data, enc), enc)
        except (UnicodeError, LookupError):
            continue

    raise UnicodeError(
        'Unable to decode input data.  Tried the following encodings: %s.'
        % ', '.join(repr(enc) for enc in usable))
示例#11
0
def plot_word_discontinuous_distance():
    '''
    Plot word discontinuous distance.

    Loads four pickled distance collections from ``../pickled/``, prints
    their means and standard deviations, draws them as an error-bar plot
    and saves the figure as ``../fig/word_discont_dist``.

    :return: True
    '''
    pickle_paths = [
        '../pickled/word_discont_dist_LR_l.pickle',
        '../pickled/word_discont_dist_LR_r.pickle',
        '../pickled/word_discont_dist_RL_l.pickle',
        '../pickled/word_discont_dist_RL_r.pickle',
    ]
    # NOTE(review): pickle.load can execute arbitrary code; only feed it
    # files produced by a trusted run.
    word_distances = []
    for path in pickle_paths:
        # The context manager closes each file handle as soon as it is read.
        with open(path, 'rb') as handle:
            word_distances.append(pickle.load(handle))

    means = [np.mean(values) for values in word_distances]
    stds = [np.std(values) for values in word_distances]

    # A parenthesised single argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('Means:  %s' % (means,))
    print('Stds:  %s' % (stds,))

    labels = [
        'Left2Right\nLeft', 'Left2Right\nRight', 'Right2Left\nLeft',
        'Right2Left\nRight'
    ]

    # x positions 0.9, 1.9, ... shared by the markers and the tick labels.
    positions = np.arange(len(means)) + .9

    plt.errorbar(positions, means, stds, marker='o', linestyle='None',
                 ecolor='#5f9ed1', mfc='#5f9ed1', mec='None', label='Young')
    plt.xticks(
        positions,
        [compat.text_type(s.replace(' ', '\n')) for s in labels])
    plt.yticks(range(1, 21), [str(i) for i in range(1, 21)])
    plt.xlabel('Word discontinuous events')
    plt.ylabel('Distance')

    plt.ylim((1, 20))

    plt.savefig('../fig/word_discont_dist', dpi=100, bbox_inches='tight')
    plt.close()

    return True
示例#12
0
def _tgrep_node_literal_value(node):
    '''
    Return the string that the tgrep node literal predicates compare
    against: the label when ``_istree(node)`` holds, otherwise the
    text form of the node itself.
    '''
    if _istree(node):
        return node.label()
    return text_type(node)
示例#13
0
def _tgrep_node_literal_value(node):
    '''
    Produce the string value of a parse tree node for use by the tgrep
    node literal predicates.
    '''
    if not _istree(node):
        # Not a tree: compare against the node's text form.
        return text_type(node)
    # A tree is represented by its label.
    return node.label()