Пример #1
0
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.utils.nlp import polarity

# Short product-review snippets used to demonstrate polarity scoring.
sentences = [
    "So there is no way for me to plug it in here in the US unless I go by a converter.",
    "Good case, Excellent value.",
    "Works great!",
    'The design is very odd, as the ear "clip" is not very comfortable at all.',
    "Needless to say, I wasted my money.",
]

# Preprocessing pipeline: the options below are passed straight to
# ekphrasis; tokenization is done by a lowercasing SocialTokenizer.
text_processor = TextPreProcessor(
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    unpack_contractions=True,
    fix_text=True,
)

# Run every sentence through the pipeline and keep the token lists.
tokenized_sentences = [
    tokens for tokens in text_processor.pre_process_docs(sentences)
]

# Print one line per sentence: polarity (4 decimals), a tab, then the tokens.
for tokens in tokenized_sentences:
    score, _ = polarity(tokens)
    print("{:.4f}\t{}".format(score, " ".join(tokens)))
Пример #2
0
    def yelpInstanceConstructFromTrain(self, paramFpathInTrainTxt,
                                       paramFpathOutToken2IndexDict,
                                       paramFpathOutIndex2TokenDict,
                                       paramFpathOutTrainParams,
                                       paramFpathOutTrainInstance):
        '''
        tokenize the training text, build the vocabulary (top 20,000
        tokens by frequency), and write the vocabulary maps, the training
        parameters and the padded training instances to disk
        ===================================================
        parameters:
        -----------
        paramFpathInTrainTxt: input text file, one document per line
        paramFpathOutToken2IndexDict: output json, map token to index
        paramFpathOutIndex2TokenDict: output json, map index to token
        paramFpathOutTrainParams: output text file with the number of
            training instances, the max document length and the
            vocabulary size
        paramFpathOutTrainInstance: output text file, one training
            instance (space separated counts) per line

        return:
        -----------
        None
        '''

        # ----------read in the train.txt, one stripped document per line
        with open(paramFpathInTrainTxt, 'rt',
                  encoding='utf8') as fpointerInTrainTxt:
            listTrainTxt = [aLine.strip() for aLine in fpointerInTrainTxt]

        # ----------initialize TextPreProcessor
        # NOTE: the keyword must be spelled `normalize`; TextPreProcessor
        # takes **kwargs, so a misspelled keyword is silently ignored and
        # no normalization would be applied.
        text_processor = TextPreProcessor(
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'date', 'number'
            ],
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", "emphasis",
                "censored"
            },
            fix_html=True,
            segmenter="english",
            corrector="english",
            unpack_hashtags=True,
            unpack_contractions=True,
            spell_correct_elong=False,
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            dicts=[emoticons])

        listTrainTxtTokenized = \
            list(text_processor.pre_process_docs(listTrainTxt))
        listTrainTxt = None  # the raw text is no longer needed

        # ----------count token frequencies and keep the top 20,000 tokens
        dictVocabulary2Freq = dict()
        for listTokens in listTrainTxtTokenized:
            for aToken in listTokens:
                dictVocabulary2Freq[aToken] = \
                    dictVocabulary2Freq.get(aToken, 0) + 1
        list_k_v_top_20000 = sorted(dictVocabulary2Freq.items(),
                                    key=operator.itemgetter(1),
                                    reverse=True)[0:20000]
        setTopTokens = {k for k, _ in list_k_v_top_20000}
        dictVocabulary2Freq = None
        list_k_v_top_20000 = None

        # ----------calculate maxDocumentSize and build the index maps;
        #           indices follow first appearance in the corpus
        maxDocumentSize = 0
        dictVocabulary2Index = dict()
        dictIndex2Vocabulary = dict()
        for listTokens in listTrainTxtTokenized:
            maxDocumentSize = max(maxDocumentSize, len(listTokens))
            for aToken in listTokens:
                # filter rare words, reduce vocabulary size
                if aToken not in setTopTokens:
                    continue
                if aToken not in dictVocabulary2Index:
                    tokenIndex = len(dictVocabulary2Index)
                    dictVocabulary2Index[aToken] = tokenIndex
                    dictIndex2Vocabulary[tokenIndex] = aToken
        vocabularySize = len(dictVocabulary2Index)

        # ----------save the vocabulary tables
        with open(paramFpathOutToken2IndexDict, 'wt',
                  encoding='utf8') as fp4jsonoutput:
            json.dump(dictVocabulary2Index, fp4jsonoutput,
                      ensure_ascii=False)

        with open(paramFpathOutIndex2TokenDict, 'wt',
                  encoding='utf8') as fp4jsonoutput:
            json.dump(dictIndex2Vocabulary, fp4jsonoutput,
                      ensure_ascii=False)
        dictIndex2Vocabulary = None

        # ----------save the parameters
        str4write = 'TrainingInstances: %d\n' % len(listTrainTxtTokenized)\
            + 'DocumentSeqLen: %d\n' % maxDocumentSize\
            + 'VocabularySize: %d\n' % vocabularySize
        with open(paramFpathOutTrainParams, 'wt',
                  encoding='utf8') as fpointerOutParams:
            fpointerOutParams.write(str4write)

        # ----------construct training instances and perform padding
        def _tokenlist_to_traininstance(tokenlist):
            '''
            from tokenlist to padded instance list

            every token position produces two count vectors of length
            vocabularySize: one for the token itself and one for the
            tokens in its surrounding window; positions up to
            maxDocumentSize are padded with all-zero vectors.
            tokens that were filtered out of the vocabulary have no
            index, so they are left out of the counts (the original
            lookup raised KeyError on them)
            '''
            tokenlist_size = len(tokenlist)
            traininginstance = list()
            for n in range(tokenlist_size):
                # window [n - HALF_WINDOW_SIZE, n + HALF_WINDOW_SIZE);
                # slicing clamps the upper bound to the list length
                window_start = max(0, n - HALF_WINDOW_SIZE)
                tokenlist_section = \
                    tokenlist[window_start:n + HALF_WINDOW_SIZE]

                # count vector of the token at position n
                center_counts = [0] * vocabularySize
                center_index = dictVocabulary2Index.get(tokenlist[n])
                if center_index is not None:
                    center_counts[center_index] += 1
                traininginstance.append(center_counts)

                # count vector of the tokens inside the window
                window_counts = [0] * vocabularySize
                for atoken in tokenlist_section:
                    token_index = dictVocabulary2Index.get(atoken)
                    if token_index is not None:
                        window_counts[token_index] += 1
                traininginstance.append(window_counts)

            # ----------padding: two zero vectors per missing position
            for _ in range(tokenlist_size, maxDocumentSize):
                traininginstance.append([0] * vocabularySize)
                traininginstance.append([0] * vocabularySize)

            return traininginstance

        def _traininstance_to_string(traininstance):
            '''
            from traininstance to a string: every count vector is space
            joined and prefixed with one space, the line ends with a
            newline
            '''
            return ''.join(' ' + ' '.join(map(str, acountlist_vocab))
                           for acountlist_vocab in traininstance) + '\n'

        with open(paramFpathOutTrainInstance, 'wt',
                  encoding='utf8') as fpointerOutTrainInstance:
            for aTrainTxtTokenized in listTrainTxtTokenized:
                aTrainInstance = _tokenlist_to_traininstance(
                    aTrainTxtTokenized)
                fpointerOutTrainInstance.write(
                    _traininstance_to_string(aTrainInstance))

        return None
Пример #3
0
def compute_elmo_rep(model_dir, input_list, mtype='BiLSTMAttention'):
    '''
    Given a list of documents, return their trained ELMo representations.

    parameters:
    -----------
    model_dir: directory of the trained model; the weights are loaded
        from '<model_dir>/model'
    input_list: list of documents to tokenize and embed
    mtype: model type name; currently unused, kept so existing callers
        keep working

    return:
    -----------
    numpy array taken from the first entry of the model's
    'elmo_representations' output
    (presumably [batch_size, seq_len, wordembedding_len] -- TODO confirm)
    '''
    # Just take the default config to do the prediction work
    config = DefaultConfig()
    config.set_attrs({'batch_size': 8})
    model_path = '%s/model' % model_dir

    # NOTE: the keyword must be spelled `normalize`; TextPreProcessor takes
    # **kwargs, so a misspelled keyword is silently ignored.
    # NOTE(review): confirm the model under model_dir was trained on text
    # preprocessed with the same (normalizing) pipeline.
    text_processor = TextPreProcessor(
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
            'date', 'number'
        ],
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", "emphasis",
            "censored"
        },
        fix_html=True,
        segmenter="english",
        corrector="english",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons])

    listTokenized = list(text_processor.pre_process_docs(input_list))
    print('After tokenization:')
    print(listTokenized)

    # character ids for every token of every document
    tensorTokenizedCharEncoded = batch_to_ids(listTokenized)

    # the ids only need to be int64 for the model; no numpy round trip
    # is required for that
    x = Variable(tensorTokenizedCharEncoded.long(), requires_grad=False)
    x = x.cuda() if config.on_cuda else x.cpu()

    # 1024 is the Elmo size; the two concatenated BiLSTM directions are
    # meant to add up to it, however, any size is OK
    elmo_size = 1024
    bilstm_hidden_size = elmo_size // 2
    # the original expression (a // b * b + a % b) always equals a, i.e.
    # the concatenated hidden size
    attention_size = bilstm_hidden_size * 2

    model = biLSTMAttention.BiLSTMAttention(
        param_document_seq_len=tensorTokenizedCharEncoded.size(1),
        # the character embedding length depends on the setting
        param_character_embedding_len=tensorTokenizedCharEncoded.size(2),
        param_bilstm_hidden_size=bilstm_hidden_size,
        param_attention_size=attention_size,
        param_class_count=5,
        param_options_file=config.options_file,
        param_weight_file=config.weight_file)
    print('Loading trained model')

    # here, load and save are defined in biLSTMAttention.py
    # load <=> model.load_state_dict( torch.load(path) )
    # save <=> torch.save( model.state_dict(), path )
    if config.on_cuda:
        model.load(model_path)
        model = model.cuda()
    else:
        model.load_cpu_from_gputrained(model_path)
        model = model.cpu()

    elmo_dict = model.forward_obtainTrainedElmoRep(x)

    # only the first representation is used; the original author notes
    # that num_output_representations = 1, so the list has one entry
    elmo_rep = elmo_dict['elmo_representations'][0]

    arr_elmo_rep = elmo_rep.data.cpu().numpy()

    return arr_elmo_rep