Example #1
0
def createInstanceObjectList(processed_dataset):
    '''
    Create a list of Instance objects from pre-processed (text, label) records.

    Parameters:
        processed_dataset: iterable of indexable records where item[1] is the
            raw text line and item[2] is the label.

    Returns:
        list of Instance objects, each with a 1-based tokenDictionary of
        Token objects, optionally normalized per the feature configuration.
    '''
    print('Reading instances...')
    instanceObjects = []
    for item in processed_dataset:
        data = item[1].strip('\n')
        # BUGFIX: the original only assigned `content` when `data` was
        # non-empty, so an empty first line raised UnboundLocalError and a
        # later empty line silently reused the previous line's content.
        # Skip empty records instead.
        if not data:
            continue
        all_words = word_tokenize(data)
        content = ' '.join(str(elem) for elem in all_words)
        label = item[2]
        instanceObject = Instance(content, label)
        # Token positions are 1-based, consistent with the file-based loader.
        for i, token in enumerate(content.split()):
            instanceObject.tokenDictionary[i + 1] = Token(token)
        if FeatureSelection.getInstance(featureFile).normalizeInstances:
            instanceObject.tokenDictionary = instanceObject.normalizeTokens()
        instanceObjects.append(instanceObject)
    return instanceObjects
Example #2
0
def createInstanceObjectList(inputFileName):
    '''
    Create a list of Instance objects from a tab-separated input file, which
    contains all the (linguistic) information needed to extract the features
    for sentiment polarity classification.

    Parameters:
        inputFileName: path to a UTF-8 file with one "content<TAB>label"
            record per line.

    Returns:
        list of Instance objects, each with a 1-based tokenDictionary of
        Token objects, optionally normalized per the feature configuration.
    '''
    print('Reading instances...')
    instanceObjects = []
    with codecs.open(inputFileName, 'r', 'utf8') as inputFile:
        # BUGFIX: the original mixed a tab with spaces on the `for` line,
        # which is a TabError under Python 3; indentation is now uniform.
        for line in inputFile:
            stripped = line.strip()
            # BUGFIX: a blank line made split('\t') return [''] and the
            # two-value unpack raise ValueError; skip blank lines instead.
            if not stripped:
                continue
            content, label = stripped.split('\t')
            instanceObject = Instance(content, label)
            # Token positions are 1-based, consistent with the dataset loader.
            for i, token in enumerate(content.split()):
                instanceObject.tokenDictionary[i + 1] = Token(token)
            if FeatureSelection.getInstance(featureFile).normalizeInstances:
                instanceObject.tokenDictionary = instanceObject.normalizeTokens()
            instanceObjects.append(instanceObject)
    return instanceObjects