from collections import defaultdict
import string

from nltk.stem import PorterStemmer

# FeatureExtractionUtilities is a project-local module assumed importable;
# the examples below also assume a module-level stemmer instance.
stemmer = PorterStemmer()


def loadData(f_path):
    '''
        Given a path to a tab-separated data set, load it into a dict of
        parallel lists keyed by column name (id, synsets, clusters, text,
        unstemmed_text, class).
    '''
    loaded_data_set = defaultdict(list)
    infile = open(f_path)
    for line in infile:
        line = line.decode('utf8', 'ignore').encode('ascii', 'ignore')
        try:
            items = line.split('\t')
            if len(items) > 3:
                tweet_id = items[0]
                user_id = items[1]
                text = string.lower(string.strip(items[-1]))
                class_ = items[2]

                senttokens = text.split()  # alternative: nltk.word_tokenize(text)
                stemmed_text = ''
                for t in senttokens:
                    stemmed_text += ' ' + stemmer.stem(t)

                loaded_data_set['id'].append(tweet_id + '-' + user_id)
                loaded_data_set['synsets'].append(
                    FeatureExtractionUtilities.getSynsetString(text, None))
                loaded_data_set['clusters'].append(
                    FeatureExtractionUtilities.getclusterfeatures(text))
                loaded_data_set['text'].append(stemmed_text)
                loaded_data_set['unstemmed_text'].append(text)
                loaded_data_set['class'].append(class_)

        except UnicodeDecodeError:
            print 'Please convert the file to the correct encoding.'

    infile.close()
    return loaded_data_set
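

# Usage sketch (illustrative, not part of the original example): loadData
# expects a tab-separated file with at least four columns laid out as
# tweet_id, user_id, class label, ..., tweet text (text must be last).
# 'sample.tsv' and the row below are made up, and
# FeatureExtractionUtilities.loadItems() must run first so the synset and
# cluster lookups work.
FeatureExtractionUtilities.loadItems()
with open('sample.tsv', 'w') as out:
    out.write('123\t456\t1\tthis drug gave me an awful headache\n')
data = loadData('sample.tsv')
print data['id']      # ['123-456']
print data['class']   # ['1']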
Example #2
def generateSentementFeatures(x_text, max_length):
    # Build sentiment features for each raw text, using max_length as the
    # fixed sequence length.
    sequence_length = max_length  # alternative: max(len(x) for x in x_text)
    stemmer = PorterStemmer()
    unstemmed_texts = []
    for line in x_text:
        senttokens = nltk.word_tokenize(line)
        # build a stemmed copy of the text; note that only the unstemmed
        # text is actually used below
        stemmed_text = ''
        for t in senttokens:
            stemmed_text += ' ' + stemmer.stem(t)
        unstemmed_texts.append(line)

    print 'Generating training set sentiment features .. '
    sentiments = FeatureExtractionUtilities.getsentimentfeatures(
        unstemmed_texts, sequence_length)
    return sentiments
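

# Usage sketch (illustrative): one sentiment feature row per input text is
# assumed to come back from getsentimentfeatures, sized by max_length. The
# texts are made up; NLTK's 'punkt' tokenizer data and the project's
# sentiment resources (FeatureExtractionUtilities.loadItems()) must be
# available.
FeatureExtractionUtilities.loadItems()
texts = ['this drug gave me a terrible headache', 'feeling great today']
features = generateSentementFeatures(texts, max_length=32)
print len(features)   # expected: 2, one row per text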
Example #3
        arousalScore = numpy.load("dump/" + data_set + "/arousal_feature_list.npy")
        dominanceScore = numpy.load("dump/" + data_set + "/dominance_feature_list.npy")
        # print sentimentScore.shape
    else:
        # adeExact = numpy.load("dump/" + data_set + "/ade-exact.npy")
        adeNoStopWords = numpy.load("dump/" + data_set + "/ade-no-stopwords.npy")

    # x_text, Y = data_helpers.load_data_and_labels("./data/rt-polaritydata/sts-gold.pos",
    #                                                 "./data/rt-polaritydata/sts-gold.neg")

    print Y.shape
else:
    # file = open("testfile.txt", "wb")
    x_text, Y = data_helpers.load_data_and_y_labels("./data/MR/rt-polarity.pos",
                                                    "./data/MR/rt-polarity.neg")
    FeatureExtractionUtilities.loadItems()

    max_document_length = 32

    (s, negScore, posScore, adeScore, subjScore, pposScore, nnegScore,
     moreGoodScore, moreBadScore, lessBadScore, lessGoodScore,
     sentence_cluster, sentence_cluster2, wordLength, wordOrder) = \
        FeatureExtractionUtilities.generateSemVec(
            x_text, max_document_length, embed_size=1)

    s = numpy.array(s)
    negScore = numpy.array(negScore)
    posScore = numpy.array(posScore)
    adeScore = numpy.array(adeScore)
    subjScore = numpy.array(subjScore)
    pposScore = numpy.array(pposScore)
    nnegScore = numpy.array(nnegScore)
    moreGoodScore = numpy.array(moreGoodScore)
    moreBadScore = numpy.array(moreBadScore)
def loadFeatureExtractionModuleItems():
    '''
        Load the various feature extraction resources
    '''
    FeatureExtractionUtilities.loadItems()


if __name__ == '__main__':
    #LOAD THE FEATURE EXTRACTION RESOURCES
    loadFeatureExtractionModuleItems()

    #LOAD THE DATA -- *SAMPLE SCRIPT USES THE SAME DATA FOR TRAINING AND TESTING*
    data_set_filename = 'binary_downloaded.tsv'
    training_data = loadData(data_set_filename)
    testing_data = loadData(data_set_filename)

    #GENERATE THE TRAINING SET FEATURES
    print 'GENERATING TRAINING SET FEATURES.. '
    training_data['sentiments'] = \
        FeatureExtractionUtilities.getsentimentfeatures(
            training_data['unstemmed_text'])
    training_data['structuralfeatures'] = \
        FeatureExtractionUtilities.getstructuralfeatures(
            training_data['unstemmed_text'])
    training_data['adrlexicon'] = \
        FeatureExtractionUtilities.getlexiconfeatures(
            training_data['unstemmed_text'])
    training_data['topictexts'], training_data['topics'] = \
        FeatureExtractionUtilities.gettopicscores(training_data['text'])
    training_data['goodbad'] = FeatureExtractionUtilities.goodbadFeatures(
        training_data['text'])

    #SCALE THE STRUCTURAL FEATURES
    scaler1 = preprocessing.StandardScaler().fit(
        training_data['structuralfeatures'])
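
    # A likely next step (an assumption here, mirroring the commented-out
    # lines in Example #7 and standard scikit-learn practice): apply the
    # scaler fitted on the training split to both splits so the test
    # structural features share the training feature scale.
    train_structural_features = scaler1.transform(
        training_data['structuralfeatures'])
    testing_data['structuralfeatures'] = \
        FeatureExtractionUtilities.getstructuralfeatures(
            testing_data['unstemmed_text'])
    test_structural_features = scaler1.transform(
        testing_data['structuralfeatures'])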
Example #6
def loadFeatureExtractionModuleItems():
    FeatureExtractionUtilities.loadItems()
Example #7
                training_set['text'].append(stemmed_text)
                training_set['class'].append(_class)
        except UnicodeDecodeError:
            print 'Please convert the file to the correct encoding.'

    infile.close()

    print 'Generating training set sentiment features .. '
    training_set['sentiments'] = \
        FeatureExtractionUtilities.getsentimentfeatures(unstemmed_texts)

    # Initialize the vectorizers
    print 'Initialize the vectorizers..'
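
    # A minimal sketch of the vectorizer setup (an assumption: the original
    # likely builds scikit-learn n-gram vectorizers over the stemmed text;
    # the parameters below are illustrative, not from the source).
    from sklearn.feature_extraction.text import CountVectorizer
    text_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=5000)
    text_vectors = text_vectorizer.fit_transform(training_set['text'])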
Example #8
            if FeatureExtractionUtilities.sentiposscores.has_key(
                (string.lower(str(pos_tags[i][0])), 'n')):
                posscore = float(
                    FeatureExtractionUtilities.sentiposscores[(string.lower(
                        str(pos_tags[i][0])), 'n')])
                score = 200 * posscore
                sentence_pos[i] = score

            if FeatureExtractionUtilities.sentinegscores.has_key(
                (string.lower(str(pos_tags[i][0])), 'n')):
                negscore = float(
                    FeatureExtractionUtilities.sentinegscores[(string.lower(
                        str(pos_tags[i][0])), 'n')])
                score = 100 * negscore
                sentence_neg[i] = score
    return (sentence_pos, sentence_neg)


FeatureExtractionUtilities.loadItems()

x_text, Y = load_semeval_and_y_labels(
    "data/SemEval2015-task10-test-B-input.txt")

pos_feature_list = []
neg_feature_list = []

p = Pool(4)
for (pos_list, neg_list) in p.map(generate_senti_features, x_text):
    pos_feature_list.append(pos_list)
    neg_feature_list.append(neg_list)
p.close()
p.join()

pos_feature_list = numpy.expand_dims(pos_feature_list, axis=2)
neg_feature_list = numpy.expand_dims(neg_feature_list, axis=2)
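
# After expand_dims, each feature array has shape (num_texts, seq_len, 1).
# A plausible follow-up (an assumption, not shown in the original) is to
# stack the positive and negative channels into one tensor, e.g. as an
# auxiliary CNN input:
senti_features = numpy.concatenate(
    (pos_feature_list, neg_feature_list), axis=2)
print senti_features.shape   # (num_texts, seq_len, 2)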