Example #1
def trainClassifiers():
    argv = sys.argv[1:]

    extractFeatures(argv[0])

    trainXGBoost(FEATURES)
    print('\n\n')
    trainRandomForrest(FEATURES)
Example #2
def applyRules(IDsFilename):
    """Uses a rule-based approach to classify the reviews from the given set."""
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH,
                                                   file=IDsFilename))

    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)

    # print("Loading reviews...")
    # dataSet = Corpus.loadCorpus(filename="training_set.pk")

    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs,
                                               dataSet.reviews)

    gold = dataSet.goldStandard
    classification = classify(features, featureVectors)

    showFeatureOccurrence(features, featureVectors, gold, classification)

    targets = []
    cls = []

    for ID, g in gold.items():
        targets.append(g)
        cls.append(classification[ID])

    showPerformance(targets, cls)
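classify() and showPerformance() are defined elsewhere in the repository. As a rough orientation only, a report of the kind showPerformance() presumably prints can be built from the two parallel lists above; this sketch uses scikit-learn, which is an assumption, not the project's actual implementation:

# Sketch only: the kind of precision/recall summary that
# showPerformance(targets, cls) likely produces (scikit-learn is assumed).
from sklearn.metrics import classification_report

def showPerformanceSketch(targets, cls):
    print(classification_report(targets, cls, digits=3))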
Example #3
def applyRules(IDsFilename):
    """Uses a rule-based approach to classify the reviews from the given set."""
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH, 
                                                    file=IDsFilename))
    
    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    
    # print("Loading reviews...")
    # dataSet = Corpus.loadCorpus(filename="training_set.pk")

    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs, 
                                                dataSet.reviews)

    gold = dataSet.goldStandard
    classification = classify(features, featureVectors)

    showFeatureOccurrence(features, featureVectors, gold, classification)
    
    targets = []
    cls = []

    for ID, g in gold.items():
        targets.append(g)
        cls.append(classification[ID])

    showPerformance(targets, cls)
Example #4
def showFeatures(IDsFilename=REVIEW_IDS_FILENAME):
    corpus = Corpus(IDsFilename)
    features, featureVectors = extractFeatures(corpus.reviewIDs,
                                               corpus.reviews,
                                               features=None)

    showFeatureOccurrence(features, featureVectors)
Example #5
def testARFFExport():
    """ Tests the functionality to export the features as a valid ARFF file."""
    ironicIDs, regularIDs, reviews = createTestReviews()
    reviewIDs = ironicIDs + regularIDs
    # for review in reviews.values():
    #     print(review)

    features, featureVectors = extractFeatures(reviewIDs, reviews, features=None, createARFF=True)
Example #6
def main():

    argv = sys.argv[1:]
    # print(argv)

    try:
        opts, args = getopt.getopt(argv, 'i:o:c:', [])
        print(opts)
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    inDir = ''
    outDir = ''
    classifier = ''

    for opt, arg in opts:
        if opt == '-i':
            if not os.path.exists(arg):
                print('Aborting, can not find infile:', arg)
                sys.exit(2)
            inDir = arg
        elif opt == '-o':
            outDir = arg
        else:
            if arg not in ('xgboost', 'randomforrest'):
                print(
                    'Aborting, invalid classifier must be "xgboost" or "randomforrest"'
                )
                sys.exit(2)
            classifier = arg

    extractFeatures(inDir)

    if classifier == 'randomforrest':
        if not os.path.exists(RANDOM_FORREST_CLASSIFIER):
            print('Could not find trained RandomForrestClassifier')
            sys.exit(2)
        testRandomForrest('features.csv', outDir)

    if classifier == 'xgboost':
        if not os.path.exists(XGBOOST_MODEL):
            print('Can not find trained XGBoost model')
            sys.exit(2)
        testXGBoost('features.csv', outDir)
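For orientation, the option string 'i:o:c:' above makes getopt return (flag, value) pairs, which is what the loop over opts consumes. A standalone illustration with made-up argument values:

# Standalone illustration of the 'i:o:c:' option spec used above.
import getopt

opts, args = getopt.getopt(['-i', 'reviews/', '-o', 'out/', '-c', 'xgboost'],
                           'i:o:c:', [])
# opts == [('-i', 'reviews/'), ('-o', 'out/'), ('-c', 'xgboost')], args == []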
Example #7
def testARFFExport():
    """ Tests the functionality to export the features as a valid ARFF file."""
    ironicIDs, regularIDs, reviews = createTestReviews()
    reviewIDs = ironicIDs + regularIDs
    # for review in reviews.values():
    #     print(review)
    
    features, featureVectors = extractFeatures(reviewIDs, reviews, 
                                                features=None, createARFF=True)
Example #8
def testRules():
    """Uses a rule-based approach to classify reviews."""
    ironicIDs, regularIDs, reviews = createTestReviews()
    features, featureVectors = extractFeatures(ironicIDs + regularIDs, reviews)

    gold = {ID: reviews[ID].ironic for ID in ironicIDs + regularIDs}
    classification = ruleClassify(features, featureVectors)

    showFeatureOccurrence(features, featureVectors, gold, classification)
    showPerformance(gold, classification)
Example #9
def testRules():
    """Uses a rule-based approach to classify reviews."""
    ironicIDs, regularIDs, reviews = createTestReviews()
    features, featureVectors = extractFeatures(ironicIDs + regularIDs, reviews)

    gold = {ID: reviews[ID].ironic for ID in ironicIDs + regularIDs}
    classification = ruleClassify(features, featureVectors)

    showFeatureOccurrence(features, featureVectors, gold, classification)
    showPerformance(gold, classification)
Example #10
def applySingleRules(IDsFilename):
    """
    Originally meant to apply just one rule.
    Now used to apply one feature at a time to the given corpus, i.e. it shows
    how often each feature occurs in ironic and regular reviews.
    """
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH, 
                                                    file=IDsFilename))
    
    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    print("Loading reviews...")
#   dataSet = Corpus.loadCorpus(filename="training_set.pk")
    # dataSet = Corpus.loadCorpus(filename="training_and_validation_set.pk")


    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs, 
                                                dataSet.reviews)

    showFeatureOccurrence(features, featureVectors)

    gold = dataSet.goldStandard
    
    # decisiveFeatureNames = ["Scare quotes", 
    #                         "Positive star polarity discrepancy",
    #                         "Negative star polarity discrepancy",
    #                         "Positive Ppunctuation",
    #                         "Negative Ppunctuation",
    #                         "Streak of Positive Words",
    #                         "Ellipsis and Punctuation",
    #                         "Emoticon Happy", "Emoticon Laughing", 
    #                         "Emoticon Winking", "Emotion Tongue", 
    #                         "LoLAcroym", "GrinAcronym", "Onomatopoeia",
    #                         "Interrobang"]

    decisiveFeatureNames = [f.name for f in features]

    for d in decisiveFeatureNames:
        classification = classify(features, featureVectors, [d])

        targets = []
        cls = []

        for ID, g in gold.items():
            targets.append(g)
            cls.append(classification[ID])

        print("\nClassifying by rule: ", d)

        showPerformance(targets, cls)
Example #11
def applySingleRules(IDsFilename):
    """
    Originally meant to apply just one rule.
    Now used to apply one feature at a time to the given corpus, i.e. it shows
    how often each feature occurs in ironic and regular reviews.
    """
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH,
                                                   file=IDsFilename))

    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    print("Loading reviews...")
    #   dataSet = Corpus.loadCorpus(filename="training_set.pk")
    # dataSet = Corpus.loadCorpus(filename="training_and_validation_set.pk")

    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs,
                                               dataSet.reviews)

    showFeatureOccurrence(features, featureVectors)

    gold = dataSet.goldStandard

    # decisiveFeatureNames = ["Scare quotes",
    #                         "Positive star polarity discrepancy",
    #                         "Negative star polarity discrepancy",
    #                         "Positive Ppunctuation",
    #                         "Negative Ppunctuation",
    #                         "Streak of Positive Words",
    #                         "Ellipsis and Punctuation",
    #                         "Emoticon Happy", "Emoticon Laughing",
    #                         "Emoticon Winking", "Emotion Tongue",
    #                         "LoLAcroym", "GrinAcronym", "Onomatopoeia",
    #                         "Interrobang"]

    decisiveFeatureNames = [f.name for f in features]

    for d in decisiveFeatureNames:
        classification = classify(features, featureVectors, [d])

        targets = []
        cls = []

        for ID, g in gold.items():
            targets.append(g)
            cls.append(classification[ID])

        print("\nClassifying by rule: ", d)

        showPerformance(targets, cls)
Example #12
def evaluateClassifiers():
    argv = sys.argv[1:]

    # If dir is specified recompute features
    top_features = 0
    feat_selection = []
    if len(argv) == 1:
        if argv[0].isdigit():
            top_features = int(argv[0])
        else:
            extractFeatures(argv[0])

    if len(argv) == 2:
        top_features = int(argv[0])
        extractFeatures(argv[1])

    # Load features and split in test/train
    data = pd.read_csv(FEATURES)
    train, test = train_test_split(data, test_size=0.2, random_state=1)

    # Write as training and testing expect to read from file
    train.to_csv(EVAL_TRAIN)
    test.to_csv(EVAL_TEST)

    # Feature selection
    if top_features != 0:
        features = pd.read_csv('eval_forrest_feature_importance.csv',
                               names=['feature', 'importance'],
                               skiprows=1)
        features.sort_values(by=['importance'], ascending=False, inplace=True)
        # .as_matrix() has been removed from pandas; .to_numpy() is the current equivalent
        feat_selection = features['feature'][:top_features].to_numpy()
        print("USED TOP {} FEATURES".format(len(feat_selection)),
              feat_selection)

    # Evaluate both classifiers
    evaluateRandomForrest(test, train, feat_selection)
    evaluateXGBoost(test, train, feat_selection)
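The feature-selection branch above expects a two-column file 'eval_forrest_feature_importance.csv' (a header row, then feature,importance pairs), presumably written by the random-forest evaluation step. A standalone sketch of how such a file can be produced from a fitted model; this is an assumption, not the repository's code:

# Sketch (assumption): writing a feature/importance CSV of the shape that
# evaluateClassifiers() reads back above.
import pandas as pd

def writeFeatureImportance(fitted_forest, feature_names, path):
    pd.DataFrame({'feature': feature_names,
                  'importance': fitted_forest.feature_importances_}).to_csv(path, index=False)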
Example #13
def createARFF(class1, class2, arff_path, corpus_path=CORPUS_PATH+TRAIN_PATH):
    """
    Define features by their number (see features: feature_names) and create ARFF file.
    """
    corpus = Corpus(class1, class2, corpusPath=corpus_path)
            
    allConfig = range(5)
    
    featureConfigs = []
                        #  mode,feat,regEx,new_stack,bigram,sentiment,configuration
    featureConfigs.append(("specific",True,False,False,False,False,allConfig,"features_all_specific" + "_" + class1 + "_vs_" + class2))
    
    # combine each feature with each feature, e.g. IF stopword = true AND negation = true THEN combi = true.
    binary_combination=False
    # = createARFF -> the ARFF file is created; "" -> it is not created
    
    for mode, feat, regExp, new_stack, bigram, sentiment, config, createARFF in featureConfigs:
        createARFF_file = createARFF        
        features, featureVectors = extractFeatures(class1, class2, mode, arff_path, corpus.class1IDs + corpus.class2IDs, corpus.tweets, config, feat, regExp, new_stack, binary_combination, sentiment, bigram, createARFF_file)
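The binary_combination flag above refers to the pairwise AND-combination described in the comment (IF stopword = true AND negation = true THEN combi = true). A minimal sketch of that idea; the helper name and the plain list-of-ints representation are assumptions, not the repository's API:

# Sketch of pairwise AND-combination of binary features.
from itertools import combinations

def combineBinaryFeatures(vector):
    combined = [int(a and b) for a, b in combinations(vector, 2)]
    return list(vector) + combined

# combineBinaryFeatures([1, 0, 1]) -> [1, 0, 1, 0, 1, 0]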
Example #14
def process_audio(request):
    file = request.FILES['speech']
    fs = FileSystemStorage()
    filename = "temp_%d.wav" % time.time()
    filename = fs.save(filename, file)
    feats = extractFeatures("%s/%s" % (fs.location, filename), scmc=True)
    f_label = gmm.test(feats, 256, settings.GMM_ROOT)
    # r = settings.RECOGNIZER
    # with sr.AudioFile("%s/%s" % (fs.location, filename)) as source:
    # 	audio = r.record(source)
    # transcript,score = r.recognize_sphinx(audio)
    ler, transcript = LER("%s/%s" % (fs.location, filename),
                          request.POST['transcript'])
    r_label = ler < settings.RECOGNITION_THRESH
    context = {
        "label": "passed" if r_label and f_label else "failed",
        "transcript": transcript,
        "score": ler
    }
    resp = json.dumps(context)
    # return render(resp, content_type='application/json')
    return render(request, "basic_index.html", context)
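The unused resp = json.dumps(context) and the commented-out return suggest a JSON response was once intended. If that path is revived, Django's JsonResponse is the usual tool; this is a sketch, not the project's code:

# Sketch (assumption): returning the context dict as JSON instead of rendering HTML.
from django.http import JsonResponse

def as_json_response(context):
    return JsonResponse(context)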
Example #15
import pandas
import features
import classify
import cluster
import itertools

INPUT_FILE = "input.log"
FEATURES_FILE = "features.csv"
RESULT_FILE = "clusterResult.csv"
EXTRACTED_SECTIONS_FILE = "detectedSections.log"
LABEL_COLUMN_NAME = "label"
LINE_NUMBER_COLUMN_NAME = "line_number"

features.extractFeatures(inputFile=INPUT_FILE, outputFile=FEATURES_FILE)

unlabelledSet = pandas.read_csv(FEATURES_FILE, skipinitialspace=True, header=0)

unlabelledSet[LABEL_COLUMN_NAME] = pandas.DataFrame(
    classify.predict(unlabelledSet), columns=[LABEL_COLUMN_NAME])

#unlabelledSet[LABEL_COLUMN_NAME] = pandas.read_csv("labels.csv", skipinitialspace=True, header=0)

labelledSet = unlabelledSet

#classify.train(labelledSet, 1000)

#classify.evaluate(labelledSet, 1)

type10 = cluster.kMeans(10, labelledSet, columnPrefix="type")
type100 = cluster.kMeans(100, labelledSet, columnPrefix="type")
type1000 = cluster.kMeans(1000, labelledSet, columnPrefix="type")
Example #16
#extract the data from csv
const.M_names, const.F_names, const.name_map = dataParser.get_data()
#extract the training/test sets
const.ngram_training_set = extract(config.data_extraction_size)
const.training_set = extract(config.training_size, labelled=True)
const.cv_set = extract(config.cv_set_size, labelled=True)
const.test_set = extract(config.test_set_size, labelled=True)
#determine most common ngrams

getCommonGrams(const.ngram_training_set)
get_suffixes(const.ngram_training_set)

const.featureCount += config.di_num + config.tri_num + config.last_letters + config.di_sufnum + config.tri_sufnum

const.X_train, const.y_train = extractFeatures(const.training_set)
const.X_cv, const.y_cv = extractFeatures(const.cv_set)
const.X_test, const.y_test = extractFeatures(const.test_set)

#--CLASSIFIER--#

X_train = torch.stack([torch.tensor(i) for i in const.X_train])
y_train = torch.from_numpy(const.y_train)

X_cv = torch.stack([torch.tensor(i) for i in const.X_cv])
y_cv = torch.from_numpy(const.y_cv)

X_test = torch.stack([torch.tensor(i) for i in const.X_test])
y_test = torch.from_numpy(const.y_test)

training_set = data.TensorDataset(X_train, y_train)
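The snippet stops after building the TensorDataset. A typical continuation wraps it in a DataLoader for minibatch training; this step and the batch size are assumptions, not part of the original file:

# Sketch (assumption): minibatch iteration over the dataset built above.
from torch.utils.data import DataLoader

train_loader = DataLoader(training_set, batch_size=64, shuffle=True)
for X_batch, y_batch in train_loader:
    pass  # the classifier's forward/backward pass would go here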
Example #17
def applyMachineLearning(class1,
                         class2,
                         randomSeed,
                         arff_path,
                         trainingSetFilename,
                         testSetFilename=None,
                         setPath=CORPUS_PATH):
    """
    Uses a machine learning approach to classify sentences.
    """

    no_configs = [("irony", "figurative"), ("irony", "irony"),
                  ("sarcasm", "sarcasm"), ("sarcasm", "figurative"),
                  ("regular", "regular"), ("figurative", "irony"),
                  ("figurative", "sarcasm"), ("figurative", "figurative")]
    if (class1, class2) in no_configs:
        print "ERROR! Please use allowed combination of classes!"
        exit()

    ## ---------- feature configurations --------------------------------
    featureConfigs = []

    bowConfig = []
    bowBigramConfig = []

    allBinaryConfig = range(18, 40)
    allConfig = range(60)
    allConfig.extend(range(156, 157))
    allWithoutNumbers = range(18, 60)
    allWithoutNorm = range(40)
    allWithoutStacks = range(60)

    normConfig = range(40, 60)
    normConfig.extend(range(156, 157))
    numbersConfig = range(18)
    stacksConfig = range(60, 156)

    allWithoutBinary = range(18)
    allWithoutBinary.extend(range(40, 60))

    allWithoutBoW = range(60)
    allWithoutBowBigram = range(60)
    allWithoutBigrams = range(60)

    # top10 - evaluated with weka chi^2-test
    if class1 == "irony" and class2 == "sarcasm":
        top10Config = [43, 57, 52, 156, 44, 56, 51, 42, 50, 49]
    elif class1 == "irony" and class2 == "regular":
        top10Config = [35, 54, 45, 56, 50, 42, 52, 44]
    elif class1 == "sarcasm" and class2 == "regular":
        top10Config = [35, 54, 45, 50, 52, 42, 57, 56]
    else:
        top10Config = []

    # ablation study 1
    sentimentConfig = [
        0, 5, 42, 43, 44, 18, 20, 21, 22, 23, 24, 25, 26, 28, 31, 34, 36, 70,
        71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 155
    ]  # number = [0,5], norm = [42,43,44], binary = [18,20,21,22,23,24,25,26,28,31,34,36], stack = [70,71,72,73,74,75,76,78,79,80,81,82,83,84,155]

    subjConfig = [
        4, 59, 150, 151, 152, 153, 154
    ]  # number = [4], norm = [59], binary = [], stack = [150,151,152,153,154]

    syntaxConfig = [
        1, 13, 16, 52, 56, 156, 29, 115, 116, 117, 118, 119, 135, 136, 137,
        138, 139
    ]  # number = [1,13,16], norm = [52,56,156], binary = [29], stack = [115,116,117,118,119,135,136,137,138,139]

    posConfig = [
        6, 7, 11, 17, 45, 50, 51, 57, 85, 86, 87, 88, 89, 105, 106, 107, 108,
        109, 110, 111, 112, 113, 114, 140, 141, 142, 143, 144
    ]  # number = [6,7,11,17], norm = [45,50,51,57], binary = [], stack = [85,86,87,88,89,105,106,107,108,109,110,111,112,113,114,140,141,142,143,144]

    emoticonConfig = [
        2, 10, 12, 14, 15, 41, 46, 48, 53, 55, 58, 30, 32, 37, 65, 66, 67, 68,
        69, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 120, 121, 122, 123, 124,
        130, 131, 132, 133, 134, 145, 146, 147, 148, 149
    ]

    urlAndUserConfig = [
        39, 35
    ]  # number = [], norm = [], binary = [35,39], stack = []

    signalConfig = [
        3, 8, 9, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 54, 19, 27, 33, 38,
        60, 61, 62, 63, 64, 100, 101, 102, 103, 104, 125, 126, 127, 128, 129
    ]  # number = [3,8,9], norm = [40,41,42,43,44,45,46,47,48,49,54], binary = [19,27,33,38], stack = [60,61,62,63,64,100,101,102,103,104,125,126,127,128,129]

    signalGroupConfig = signalConfig
    signalGroupConfig.extend(urlAndUserConfig)
    signalGroupConfig.extend(emoticonConfig)

    syntaxGroupConfig = syntaxConfig
    syntaxGroupConfig.extend(subjConfig)
    syntaxGroupConfig.extend(posConfig)

    syntaxAndSentiment = syntaxGroupConfig
    syntaxAndSentiment.extend(sentimentConfig)

    syntaxAndSignal = syntaxGroupConfig
    syntaxAndSignal.extend(signalGroupConfig)

    sentimentAndSignal = signalGroupConfig
    sentimentAndSignal.extend(sentimentConfig)

    # ablation study 2: ablation from ALL
    allWithoutSentiment = list(set(range(157)) - set(sentimentConfig))

    allWithoutPOS = list(set(range(157)) - set(posConfig))

    # signal + emoticons + url&User
    allWithoutSignal = list(
        set(range(157)) - set(signalConfig) - set(emoticonConfig) -
        set(urlAndUserConfig))

    # Syntax # Subjectivity
    allWithoutSyntax = list(
        set(range(157)) - set(syntaxConfig) - set(subjConfig) - set(posConfig))

    allWithoutTop10 = list(set(allConfig) - set(top10Config))

    # ablation study 2: ablation from BINARY
    binaryWithoutSentiment = list(set(allBinaryConfig) - set(sentimentConfig))

    binaryWithoutPOS = list(set(allBinaryConfig) - set(posConfig))

    # New Signal: Old Signal + Emoticon and RegExp and URL and User
    binaryWithoutSignal = list(
        set(allBinaryConfig) - set(signalConfig) - set(emoticonConfig) -
        set(urlAndUserConfig))

    # New Syntax: Old Syntax + Subj:

    binaryWithoutSyntax = list(
        set(allBinaryConfig) - set(syntaxConfig) - set(subjConfig) -
        set(posConfig))

    binaryWithoutTop10 = list(set(allBinaryConfig) - set(top10Config))

    # full configuration of feature list for feature extraction
    #  mode,feat,regEx,stack_binning,bigram,sentiment,configuration
    #     featureConfigs.append(("bow",True,True,True,False,False,bowConfig,"features_bowConfig"))
    #     featureConfigs.append(("bow",True,True,True,True,False,bowBigramConfig,"features_bowBigramConfig"))
    #     featureConfigs.append(("all",True,True,False,True,True,allBinaryConfig,"features_allBinaryConfig"))
    #     featureConfigs.append(("all",True,True,True,True,True,allConfig,"features_allConfig"))
    #     featureConfigs.append(("all",True,True,True,True,True,allWithoutNumbers,"features_allWithoutNumbers"))
    #     featureConfigs.append(("all",True,True,True,True,True,allWithoutNorm,"features_allWithoutNorm"))
    #     featureConfigs.append(("all",True,True,False,True,True,allWithoutStacks,"features_allWithoutStacks"))
    #     featureConfigs.append(("all",True,False,True,True,True,allWithoutBinary,"features_allWithoutBinary"))
    #     featureConfigs.append(("specific",True,True,True,True,True,allWithoutBoW,"features_allWithoutBoW"))
    featureConfigs.append(
        ("specific", True, True, True, False, True, allWithoutBowBigram,
         "features_allWithoutBowBigram"))
    #     featureConfigs.append(("all",True,True,True,False,True,allWithoutBigrams,"features_allWithoutBigrams"))

    #     featureConfigs.append(("all",True,False,False,True,False,numbersConfig, "features_numbersConfig"))
    #     featureConfigs.append(("all",True,False,False,True,False,normConfig, "features_normConfig"))
    #     featureConfigs.append(("all",True,False,False,True,False,stacksConfig, "features_stacksConfig"))

    #     featureConfigs.append(("all",True,True,False,True,True,allWithoutEmoticons,"features_allWithoutEmoticons"))
    #     featureConfigs.append(("all",True,True,False,True,True,allWithoutSubj,"features_allWithoutSubj"))
    #     featureConfigs.append(("all",True,False,False,True,True,allWithoutRegExpAndURLandUser,"features_allWithoutRegExpAndURLandUser"))

    #     featureConfigs.append(("specific",True,True,False,True,True,allBinaryConfig,"features_binaryWithoutBoW"))
    #     featureConfigs.append(("specific",True,True,False,False,True,allBinaryConfig,"features_binaryWithoutBoWBi"))
    #     featureConfigs.append(("specific",True,True,False,False,True,allBinaryConfig,"features_binaryWithoutBi"))
    #
    #     featureConfigs.append(("all",True,True,False,True,False,binaryWithoutSentiment,"features_binaryWithoutSentiment"))
    #     featureConfigs.append(("all",True,True,False,True,True,binaryWithoutPOS,"features_binaryWithoutPOS"))
    #     featureConfigs.append(("all",True,False,False,True,True,binaryWithoutSignal,"features_binaryWithoutSignal"))
    #     featureConfigs.append(("all",True,True,False,True,True,binaryWithoutSyntax,"features_binaryWithoutSyntax"))
    #     featureConfigs.append(("all",True,True,False,True,False,binaryWithoutWeka,"features_binaryWithoutWeka"))
    #     featureConfigs.append(("all",True,True,False,True,True,binaryWithoutTop10,"features_binaryWithoutTop10"))

    #     featureConfigs.append(("all",True,True,False,True,False,allWithoutSentiment,"features_allWithoutSentiment"))
    #     featureConfigs.append(("all",True,True,False,True,True,allWithoutPOS,"features_allWithoutPOS"))
    #     featureConfigs.append(("all",True,False,False,True,True,allWithoutSignal,"features_allWithoutSignal"))
    #     featureConfigs.append(("all",True,True,False,True,True,allWithoutSyntax,"features_allWithoutSyntax"))
    #     featureConfigs.append(("all",True,True,False,True,False,allWithoutWeka,"features_allWithoutWeka"))
    #     featureConfigs.append(("all",True,True,False,True,True,allWithoutTop10,"features_allWithoutTop10"))

    # feature categories:
    #     featureConfigs.append(("all",True,False,False,True,True,sentimentConfig, "features_sentimentConfig"))
    #     featureConfigs.append(("all",True,False,False,True,False,posConfig, "features_posConfig"))
    #     featureConfigs.append(("all",True,True,False,True,False,signalGroupConfig, "features_signalGroupConfig"))
    #     featureConfigs.append(("all",True,False,False,True,False,syntaxGroupConfig, "features_syntaxGroupConfig"))
    #
    #     featureConfigs.append(("specific",True,False,False,False,True,sentimentConfig, "features_sentimentConfig_specific"))
    #     featureConfigs.append(("specific",True,False,False,False,False,posConfig, "features_posConfig_specific"))
    #     featureConfigs.append(("specific",True,True,False,False,False,signalGroupConfig, "features_signalGroupConfig_specific"))
    #     featureConfigs.append(("specific",True,False,False,False,False,syntaxGroupConfig, "features_syntaxGroupConfig_specific"))

    # Combinations:
    #     featureConfigs.append(("specific",True,True,False,False,True,sentimentAndSignal,"features_sentimentAndSignal"))
    #     featureConfigs.append(("specific",True,False,False,False,True,syntaxAndSentiment,"features_syntaxAndSentiment"))
    #     featureConfigs.append(("specific",True,True,False,False,False,syntaxAndSignal,"features_syntaxAndSignal"))

    #     featureConfigs.append(("all",True,False,False,True,False,top10Config, "features_top10Config"))

    print str(len(featureConfigs)) + " different configurations of features"

    # create file which contains status reports.
    with open("info.txt", "a") as info:
        info.write("Start" + "\n")
        print "Start"
        # TODO: Add condition to create corpus, if no file exists.
        info.write("Training the classifiers using the set at '{path}{file}'".
                   format(path=setPath, file=trainingSetFilename) + "\n")
        print(
            "Training the classifiers using the set at '{path}{file}'".format(
                path=setPath, file=trainingSetFilename))

        lt = localtime()
        info.write("Begin loading Corpus " + class1 + " vs " + class2 + " - " +
                   str(lt[3]) + "h:" + str(lt[4]) + "m:" + str(lt[5]) +
                   "s Uhr am " + str(lt[2]) + "." + str(lt[1]) + "." +
                   str(lt[0]) + "\n")
        print("Begin loading Corpus " + class1 + " vs " + class2 + " - " +
              str(lt[3]) + "h:" + str(lt[4]) + "m:" + str(lt[5]) +
              "s Uhr am " + str(lt[2]) + "." + str(lt[1]) + "." + str(lt[0]))

        # load training corpus.
        trainingSet = Corpus(class1,
                             class2,
                             trainingSetFilename,
                             corpusPath=CORPUS_PATH + TRAIN_PATH)

        # Get the ids - which are ordered class1, class2 and shuffle them.
        trainingIDs = trainingSet.tweetIDs
        random.seed(randomSeed)
        random.shuffle(trainingIDs)

        # load test corpus if filename is given; not needed for cross validation.
        if not testSetFilename == None:
            testSet = Corpus(class1,
                             class2,
                             testSetFilename,
                             corpusPath=CORPUS_PATH + TEST_PATH)
            tweets = dict(trainingSet.tweets.items() + testSet.tweets.items())

            mode_list = []
            bigram_list = []

            # only create dict for bag-of-words and bigrams if really necessary!
            for mode, feat, regExp, stack_binning, bigram, sentiment, config, createARFF in featureConfigs:
                mode_list.append(mode)
                bigram_list.append(bigram)

            if "all" in mode_list:
                bowDictionary = createBagOfWordsDictionary(tweets)
                print "loaded bow"
            elif "bow" in mode_list:
                bowDictionary = createBagOfWordsDictionary(tweets)
                print "loaded bow"
            else:
                bowDictionary = {}
                print "bow not necessary"

            if True in bigram_list:
                bigramDictionary = createBagOfBigramsDictionary(tweets)
                print "loaded bigrams"
            else:
                bigramDictionary = {}
                print "bigrams not necessary"

        else:
            bowDictionary = None
            bigramDictionary = None

        lt = localtime()
        info.write("Corpus loaded -" + str(lt[3]) + "h:" + str(lt[4]) + "m:" +
                   str(lt[5]) + "s Uhr am " + str(lt[2]) + "." + str(lt[1]) +
                   "." + str(lt[0]) + "\n")
        print("Corpus loaded -" + str(lt[3]) + "h:" + str(lt[4]) + "m:" +
              str(lt[5]) + "s Uhr am " + str(lt[2]) + "." + str(lt[1]) + "." +
              str(lt[0]))

        info.write("Extracting features with different configurations \n")
        print("Extracting features with different configurations")

    t = 0

    # feature extraction using above feature configurations.
    for mode, feat, regExp, stack_binning, bigram, sentiment, config, createARFF in featureConfigs:

        trainFeatures = []
        trainFeatureVectors = {}
        testFeatures = []
        testFeatureVectors = {}

        t += 1

        config_name = createARFF + "_" + class1 + "_vs_" + class2
        # if empty string no arff file will be generated
        # else set createARFF_file = createARFF
        createARFF_file = createARFF

        with open("info_" + config_name + ".txt", "a") as info:
            print "\n" + str(
                t
            ) + "th configuration\n-----------------------------------------\n"
            info.write(
                "\n" + str(t) +
                "th configuration\n-----------------------------------------\n"
            )

        # optional: if true, then all binary combinations of all
        # features are added to feature list.
        binary_combination = False

        # feature extraction.
        trainFeatures, trainFeatureVectors = extractFeatures(
            class1, class2, mode, arff_path, trainingIDs, trainingSet.tweets,
            config, feat, regExp, stack_binning, binary_combination, sentiment,
            bigram, createARFF_file, bowDictionary, bigramDictionary)

        # array of train data - is not necessary; just used for safeguard.
        tTargets = []
        tData = []

        # sparse matrix of train data:
        ID_map_train = {}
        rdim = len(trainFeatureVectors.keys())
        cdim = len(trainFeatureVectors[trainingIDs[0]])

        # create sparse matrix with rdim x cdim
        trainData = lil_matrix((rdim, cdim))

        trainGold = trainingSet.goldStandard

        trainTargets = range(len(trainGold))
        j = 0
        for ID, g in trainGold.items():

            ID_map_train[j] = ID

            # array part.
            tTargets.append(g)
            tData.append(trainFeatureVectors[ID])

            # matrix will be filled.
            for i in range(len(trainFeatureVectors[ID])):
                if trainFeatureVectors[ID][i] != 0:
                    trainData[j, i] = trainFeatureVectors[ID][i]
                    trainTargets[j] = g

            j += 1

        trainFeatureVectors = {}
        trainGold = {}

        classifiers = [(DecisionTreeClassifier(), "Decision_Tree"),
                       (SVC(kernel="linear"), "Linear_SVC"), (SVC(), "SVC"),
                       (LinearSVC(), "LinearSVC"),
                       (LogisticRegression(), "logRegression")]

        # classifiers which need matrix
        matrixClassifier = ["Linear_SVC", "SVC", "LinearSVC", "logRegression"]

        # Cross validation
        if testSetFilename == None:
            for c, name in classifiers:
                if name in matrixClassifier:
                    if isspmatrix(trainData):
                        duration = timeit(lambda: applyCrossValidation(
                            class1, class2, createARFF, ID_map_train, c, name,
                            trainData, trainTargets, 10),
                                          number=1)
                        showDuration(createARFF, name, duration)

                else:
                    duration = timeit(lambda: applyCrossValidation(
                        class1, class2, createARFF, ID_map_train, c, name,
                        tData, tTargets, 10),
                                      number=1)
                    showDuration(createARFF, name, duration)

        # use test data for evaluation.
        else:
            with open("info.txt", "a") as info:
                info.write(
                    "Testing the classifiers using the set at '{path}{file}'".
                    format(path=CORPUS_PATH, file=testSetFilename) + "\n")
                print(
                    "Testing the classifiers using the set at '{path}{file}'".
                    format(path=CORPUS_PATH, file=testSetFilename))

                info.write("Extracting features... \n")
                testIDs = testSet.tweetIDs

                random.seed(randomSeed)
                random.shuffle(testIDs)

                # feature extraction for test data.
                testFeatures, testFeatureVectors = extractFeatures(
                    class1, class2, mode, arff_path, testIDs, testSet.tweets,
                    config, feat, regExp, stack_binning, binary_combination,
                    sentiment, bigram, createARFF_file, bowDictionary,
                    bigramDictionary)

                # array of test data:
                tsTargets = []
                tsData = []

                # sparse matrix of test data
                rdim = len(testFeatureVectors.keys())
                cdim = len(testFeatureVectors[testIDs[0]])
                testData = lil_matrix((rdim, cdim))

                testGold = testSet.goldStandard

                testTargets = range(len(testGold))

                ID_map_test = {}
                j = 0

                for ID, g in testGold.items():

                    ID_map_test[j] = ID

                    # array
                    tsTargets.append(g)
                    tsData.append(testFeatureVectors[ID])

                    # matrix
                    for i in range(len(testFeatureVectors[ID])):
                        if testFeatureVectors[ID][i] != 0:
                            testData[j, i] = testFeatureVectors[ID][i]
                            testTargets[j] = g

                    j += 1

                testFeatureVectors = {}
                testGold = {}

                for c, name in classifiers:
                    if name in matrixClassifier:
                        duration = timeit(lambda: applyClassifier(
                            class1, class2, createARFF, ID_map_test, c, name,
                            trainData, trainTargets, testData, testTargets),
                                          number=1)
                        showDuration(createARFF, name, duration)

                    else:
                        duration = timeit(lambda: applyClassifier(
                            class1, class2, createARFF, ID_map_test, c, name,
                            tData, tTargets, tsData, tsTargets),
                                          number=1)
                        showDuration(createARFF, name, duration)
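The per-ID loops above fill a scipy lil_matrix cell by cell, which is exactly the access pattern lil_matrix is designed for; the scikit-learn estimators listed in matrixClassifier then accept the sparse matrix directly. A standalone illustration of the pattern, separate from the repository's code:

# Standalone illustration of the fill-then-convert pattern used above.
from scipy.sparse import lil_matrix

m = lil_matrix((2, 3))
m[0, 1] = 1.0
m[1, 2] = 0.5
X = m.tocsr()  # CSR is the efficient layout for fitting, e.g. LinearSVC().fit(X, y)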
Example #18
def applyML2(trainingSetFilename, testSetFilename=None, setPath=CORPUS_PATH):
    """
    Uses a machine learning approach to classify sentences.
    Implements a truly simple 'Leave One Out' function.
    """
    # TODO: Add condition to create corpus, if no file exists.
    print("Training the classifiers using the set at '{path}{file}'".format(
                                                    path=setPath,
                                                    file=trainingSetFilename))

    #trainingSet = Corpus(trainingSetFilename, corpusPath=CORPUS_PATH)
    # trainingSet = Corpus.loadCorpus(filename=trainingSetFilename)
    # trainingSet = Corpus.loadCorpus(filename="training_and_validation_set.pk")
    trainingSet = Corpus.loadCorpus(filename="shuffled_set.pk")


    # for each in trainingSet.reviewIDs[0:10]:
    #     print(each)
    # print()

    # Get the ids - which are ordered ironic, regular - and shuffle them.
    ids = trainingSet.reviewIDs
    random.seed(44)
    random.shuffle(ids)
    # for each in ids[0:10]:
    #     print(each)
    # print()

    # If the -new flag is not set OR there is no file to load,
    # recreate the corpus.


    print("Extracting features...")
#    trainFeatures, trainFeatureVectors = extractFeatures(trainingSet.reviewIDs,
#                                                trainingSet.reviews)


    featureConfig = {
        "minus Imba": { u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Quotes": {u"Positive Imbalance": (u"w-\u2605 ",
                            posStarPolarityDiscrepancy),
                        u"Negative Imbalance": (u"w+\u2606 ",
                            negStarPolarityDiscrepancy),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Pos/Neg&Punctuation": {u"Positive Imbalance": (u"w-\u2605 ",
                            posStarPolarityDiscrepancy),
                        u"Negative Imbalance": (u"w+\u2606 ",
                            negStarPolarityDiscrepancy),
                        u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Hyperbole": {u"Positive Imbalance": (u"w-\u2605 ",
                            posStarPolarityDiscrepancy),
                        u"Negative Imbalance": (u"w+\u2606 ",
                            negStarPolarityDiscrepancy),
                        u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Ellipsis and Punctuation": {u"Positive Imbalance": (u"w-\u2605 ",
                            posStarPolarityDiscrepancy),
                        u"Negative Imbalance": (u"w+\u2606 ",
                            negStarPolarityDiscrepancy),
                        u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Pos/Neg&Ellipsis": {u"Positive Imbalance": (u"w-\u2605 ",
                            posStarPolarityDiscrepancy),
                        u"Negative Imbalance": (u"w+\u2606 ",
                            negStarPolarityDiscrepancy),
                        u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
        },
        "minus Pos": {  u"Negative Imbalance": (u"w+\u2606 ",
                            negStarPolarityDiscrepancy),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Neg": {u"Positive Imbalance": (u"w-\u2605 ",
                            posStarPolarityDiscrepancy),
                        u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
    }



    for name, config in featureConfig.items():
        print("\n"*5, name)
        print("-"*60)
        for each in config:
            print(each)
        print()



        trainFeatures, trainFeatureVectors = extractFeatures(ids,
                                                trainingSet.reviews, config)

        trainTargets = []
        trainData = []

        trainGold = trainingSet.goldStandard
        for ID, g in trainGold.items():
            trainTargets.append(g)
            trainData.append(trainFeatureVectors[ID])
        #for i, vec in enumerate(data):
        #    print(targets[i], " | ", vec)

        featureCount = sum([sum(v) for v in trainData])
        print("Feature found: ", featureCount, "times.")


        classifiers = [DecisionTreeClassifier(),
                        SVC(kernel="linear"),
                        SVC(),
                        LinearSVC(),
                        MultinomialNB(),
                        GaussianNB(),
                        RandomForestClassifier(),
                        LogisticRegression(),]

        # Cross validation
        if testSetFilename == None:
            for c in classifiers:
                applyCrossValidation(c, trainData, trainTargets)

            # scores = cross_validation.cross_val_score(classifier, array(data),
            #                                         array(targets), cv=10)
            # print(scores)

        else:
            print("Testing the classifiers using the set at '{path}{file}'".format(
                                                        path=CORPUS_PATH,
                                                        file=testSetFilename))

            testSet = Corpus(testSetFilename, corpusPath=CORPUS_PATH)
            # testSet = Corpus.loadCorpus(filename="test_set.pk")

            print("Extracting features...")
            testFeatures, testFeatureVectors = extractFeatures(testSet.reviewIDs,
                                                    testSet.reviews)

            testData = []
            testTargets = []

            testGold = testSet.goldStandard
            for ID, g in testGold.items():
                testTargets.append(g)
                testData.append(testFeatureVectors[ID])

            for c in classifiers:
                applyClassifier(c, trainData, trainTargets, testData, testTargets)
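Each entry of featureConfig above is a leave-one-group-out ablation: the full rule set minus one named group of features. Such dictionaries can also be generated programmatically; a sketch under that assumption, not how the repository actually builds them:

# Sketch (assumption): deriving "minus X" ablation configs from a full feature
# map plus a grouping of feature names.
def ablationConfigs(fullConfig, groups):
    """groups maps a group name to the feature names that group removes."""
    return {"minus " + name: {k: v for k, v in fullConfig.items() if k not in members}
            for name, members in groups.items()}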
Example #19
def showFeatures(IDsFilename=REVIEW_IDS_FILENAME):
    corpus = Corpus(IDsFilename)
    features, featureVectors = extractFeatures(corpus.reviewIDs, corpus.reviews, features=None)

    showFeatureOccurrence(features, featureVectors)
Example #20
def exportFeatures():
    corpus = Corpus(SET_FILENAMES[3])
    features, featureVectors = extractFeatures(corpus.reviewIDs, corpus.reviews, features=None, createARFF=True)
Example #21
def applyML(trainingSetFilename, testSetFilename=None, setPath=CORPUS_PATH):
    """
    Uses a machine learning approach to classify sentences.

    """
    # TODO: Add condition to create corpus, if no file exists.
    print("Training the classifiers using the set at '{path}{file}'".format(
                                                    path=setPath,
                                                    file=trainingSetFilename))

    trainingSet = Corpus(trainingSetFilename, corpusPath=CORPUS_PATH)

    # trainingSet = Corpus.loadCorpus(filename="shuffled_set.pk")

    # for each in trainingSet.reviewIDs[0:10]:
    #     print(each)
    # print()

    # Get the ids - which are ordered ironic, regular - and shuffle them.
    ids = trainingSet.reviewIDs
    random.seed(44)
    random.shuffle(ids)
    # for each in ids[0:10]:
    #     print(each)
    # print()

    reviews = trainingSet.reviews

    if not testSetFilename == None:
        testSet = Corpus(testSetFilename, corpusPath=CORPUS_PATH)
        reviews = dict(trainingSet.reviews.items() + testSet.reviews.items())
        bowDictionary = createBagOfWordsDictionary(reviews)
    else:
        bowDictionary = None

    print("Extracting features...")
    trainFeatures, trainFeatureVectors = extractFeatures(ids,
                                            trainingSet.reviews,
                                            bowDictionary=bowDictionary)

    trainTargets = []
    trainData = []
    stars = []

    trainGold = trainingSet.goldStandard
    for ID, g in trainGold.items():
        trainTargets.append(g)
        trainData.append(trainFeatureVectors[ID])
        stars.append(trainingSet.reviews[ID].stars)
    #for i, vec in enumerate(data):
    #    print(targets[i], " | ", vec)

    featureCount = sum([sum(v) for v in trainData])
    # print("Feature found: ", featureCount, "times.")

    trainTargets = array(trainTargets)
    trainData = array(trainData)


    classifiers = [DecisionTreeClassifier(),
                    SVC(kernel="linear"),
                    SVC(),
                    LinearSVC(),
                    MultinomialNB(),
                    GaussianNB(),
                    RandomForestClassifier(),
                    LogisticRegression(),
                    MLPClassifier(hidden_layer_sizes=(15,), random_state=1,
                                  max_iter=1, warm_start=True)]

    # Cross validation
    if testSetFilename == None:
        for c in classifiers:
            applyCrossValidation(c, trainData, trainTargets)

            # Show star distribution for each classifier
            # applyCrossValidation(c, trainData, trainTargets, stars=stars)

        # scores = cross_validation.cross_val_score(classifier, array(data),
        #                                         array(targets), cv=10)
        # print(scores)

    else:
        print("Testing the classifiers using the set at '{path}{file}'".format(
                                                    path=CORPUS_PATH,
                                                    file=testSetFilename))

        # testSet = Corpus(testSetFilename, corpusPath=CORPUS_PATH)
        # testSet = Corpus.loadCorpus(filename="test_set.pk")

        # Create bag of words dictionary that contains words of all reviews
        # bowDictionary = createBagOfWordsDictionary(
        #                     trainingSet.reviews + testSet.reviews)


        print("Extracting features...")
        testFeatures, testFeatureVectors = extractFeatures(testSet.reviewIDs,
                                                    testSet.reviews,
                                                    bowDictionary=bowDictionary)

        testData = []
        testTargets = []

        testGold = testSet.goldStandard
        for ID, g in testGold.items():
            testTargets.append(g)
            testData.append(testFeatureVectors[ID])

        testData = array(testData)
        testTargets = array(testTargets)

        for c in classifiers:
            applyClassifier(c, trainData, trainTargets, testData, testTargets)
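applyCrossValidation() and applyClassifier() are defined elsewhere; the commented-out cross_val_score lines above hint at what the cross-validation branch does. A sketch along those lines using the current scikit-learn module path (an assumption, not the project's code):

# Sketch (assumption): roughly what applyCrossValidation(c, trainData, trainTargets)
# boils down to, mirroring the commented-out cv=10 call above.
from sklearn.model_selection import cross_val_score

def applyCrossValidationSketch(classifier, data, targets):
    scores = cross_val_score(classifier, data, targets, cv=10)
    print(type(classifier).__name__, scores.mean())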
Example #22
def applyMachineLearning(class1, class2, randomSeed, arff_path, trainingSetFilename, testSetFilename=None, setPath=CORPUS_PATH):
    """
    Uses a machine learning approach to classify sentences.
    """
    
    no_configs = [("irony", "figurative"), ("irony", "irony"), ("sarcasm", "sarcasm"), ("sarcasm", "figurative"), ("regular", "regular"), ("figurative", "irony"), ("figurative", "sarcasm"), ("figurative", "figurative")]
    if (class1, class2) in no_configs:
        print "ERROR! Please use allowed combination of classes!"
        exit()
    
    ## ---------- feature configurations --------------------------------        
    featureConfigs = []
    
    bowConfig = []
    bowBigramConfig = []
    
    allBinaryConfig = range(18,40)
    allConfig = range(60)
    allConfig.extend(range(156,157))
    allWithoutNumbers = range(18,60)
    allWithoutNorm = range(40)
    allWithoutStacks = range(60)
    
    normConfig = range(40,60)
    normConfig.extend(range(156,157))
    numbersConfig = range(18)
    stacksConfig = range(60,156)
    
    allWithoutBinary = range(18)
    allWithoutBinary.extend(range(40,60))
        
    allWithoutBoW = range(60)
    allWithoutBowBigram = range(60)
    allWithoutBigrams = range(60)
    
    # top10 - evaluated with weka chi^2-test
    if class1 == "irony" and class2 == "sarcasm":    
        top10Config = [43,57,52,156,44,56,51,42,50,49]
    elif class1 == "irony" and class2 == "regular":
        top10Config = [35,54,45,56,50,42,52,44]
    elif class1 == "sarcasm" and class2 == "regular":
        top10Config = [35,54,45,50,52,42,57,56]
    else:
        top10Config = []
    
    # ablation study 1
    sentimentConfig = [0,5,42,43,44,18,20,21,22,23,24,25,26,28,31,34,36,70,71,72,73,74,75,76,78,79,80,81,82,83,84,155] # number = [0,5], norm = [42,43,44], binary = [18,20,21,22,23,24,25,26,28,31,34,36], stack = [70,71,72,73,74,75,76,78,79,80,81,82,83,84,155]
    
    subjConfig = [4,59,150,151,152,153,154] # number = [4], norm = [59], binary = [], stack = [150,151,152,153,154]
    
    syntaxConfig =  [1,13,16,52,56,156,29,115,116,117,118,119,135,136,137,138,139] # number = [1,13,16], norm = [52,56,156], binary = [29], stack = [115,116,117,118,119,135,136,137,138,139]
    
    posConfig = [6,7,11,17,45,50,51,57,85,86,87,88,89,105,106,107,108,109,110,111,112,113,114,140,141,142,143,144] # number = [6,7,11,17], norm = [45,50,51,57], binary = [], stack = [85,86,87,88,89,105,106,107,108,109,110,111,112,113,114,140,141,142,143,144]
    
    emoticonConfig = [2,10,12,14,15,41,46,48,53,55,58,30,32,37,65,66,67,68,69,90,91,92,93,94,95,96,97,98,99,120,121,122,123,124,130,131,132,133,134,145,146,147,148,149]
    
    urlAndUserConfig = [39,35] # number = [], norm = [], binary = [35,39], stack = []
    
    signalConfig = [3,8,9,40,41,42,43,44,45,46,47,48,49,54,19,27,33,38,60,61,62,63,64,100,101,102,103,104,125,126,127,128,129] # number = [3,8,9], norm = [40,41,42,43,44,45,46,47,48,49,54], binary = [19,27,33,38], stack = [60,61,62,63,64,100,101,102,103,104,125,126,127,128,129]
    
    signalGroupConfig = signalConfig
    signalGroupConfig.extend(urlAndUserConfig)
    signalGroupConfig.extend(emoticonConfig)
    
    syntaxGroupConfig = syntaxConfig
    syntaxGroupConfig.extend(subjConfig)
    syntaxGroupConfig.extend(posConfig)
    
    syntaxAndSentiment = syntaxGroupConfig
    syntaxAndSentiment.extend(sentimentConfig)
    
    syntaxAndSignal = syntaxGroupConfig
    syntaxAndSignal.extend(signalGroupConfig)
    
    sentimentAndSignal = signalGroupConfig
    sentimentAndSignal.extend(sentimentConfig)
    
    
    # ablation study 2: ablation from ALL
    allWithoutSentiment = list(set(range(157)) - set(sentimentConfig)) 
    
    allWithoutPOS = list(set(range(157)) - set(posConfig))
    
    # signal + emoticons + url&User
    allWithoutSignal = list(set(range(157)) - set(signalConfig) - set(emoticonConfig) - set(urlAndUserConfig))

    # Syntax # Subjectivity
    allWithoutSyntax = list(set(range(157)) - set(syntaxConfig) - set(subjConfig) - set(posConfig))
        
    allWithoutTop10 = list(set(allConfig) - set(top10Config))
    

    # ablation study 2: ablation from BINARY
    binaryWithoutSentiment = list(set(allBinaryConfig) - set(sentimentConfig))
    
    binaryWithoutPOS = list(set(allBinaryConfig) - set(posConfig))
    
    # New Signal: Old Signal + Emoticon and RegExp and URL and User
    binaryWithoutSignal = list(set(allBinaryConfig) - set(signalConfig) - set(emoticonConfig) - set(urlAndUserConfig))
        
    # New Syntax: Old Syntax + Subj:

    binaryWithoutSyntax = list(set(allBinaryConfig) - set(syntaxConfig) - set(subjConfig) - set(posConfig))
       
    binaryWithoutTop10 = list(set(allBinaryConfig) - set(top10Config))
    
    # full configuration of feature list for feature extraction
                        #  mode,feat,regEx,stack_binning,bigram,sentiment,configuration
#     featureConfigs.append(("bow",True,True,True,False,False,bowConfig,"features_bowConfig"))
#     featureConfigs.append(("bow",True,True,True,True,False,bowBigramConfig,"features_bowBigramConfig"))
#     featureConfigs.append(("all",True,True,False,True,True,allBinaryConfig,"features_allBinaryConfig"))
#     featureConfigs.append(("all",True,True,True,True,True,allConfig,"features_allConfig"))
#     featureConfigs.append(("all",True,True,True,True,True,allWithoutNumbers,"features_allWithoutNumbers"))
#     featureConfigs.append(("all",True,True,True,True,True,allWithoutNorm,"features_allWithoutNorm"))
#     featureConfigs.append(("all",True,True,False,True,True,allWithoutStacks,"features_allWithoutStacks"))
#     featureConfigs.append(("all",True,False,True,True,True,allWithoutBinary,"features_allWithoutBinary"))
#     featureConfigs.append(("specific",True,True,True,True,True,allWithoutBoW,"features_allWithoutBoW"))
    featureConfigs.append(("specific",True,True,True,False,True,allWithoutBowBigram,"features_allWithoutBowBigram"))
#     featureConfigs.append(("all",True,True,True,False,True,allWithoutBigrams,"features_allWithoutBigrams"))

#     featureConfigs.append(("all",True,False,False,True,False,numbersConfig, "features_numbersConfig"))
#     featureConfigs.append(("all",True,False,False,True,False,normConfig, "features_normConfig"))
#     featureConfigs.append(("all",True,False,False,True,False,stacksConfig, "features_stacksConfig"))

#     featureConfigs.append(("all",True,True,False,True,True,allWithoutEmoticons,"features_allWithoutEmoticons"))
#     featureConfigs.append(("all",True,True,False,True,True,allWithoutSubj,"features_allWithoutSubj"))
#     featureConfigs.append(("all",True,False,False,True,True,allWithoutRegExpAndURLandUser,"features_allWithoutRegExpAndURLandUser"))

#     featureConfigs.append(("specific",True,True,False,True,True,allBinaryConfig,"features_binaryWithoutBoW"))
#     featureConfigs.append(("specific",True,True,False,False,True,allBinaryConfig,"features_binaryWithoutBoWBi"))
#     featureConfigs.append(("specific",True,True,False,False,True,allBinaryConfig,"features_binaryWithoutBi"))
#     
#     featureConfigs.append(("all",True,True,False,True,False,binaryWithoutSentiment,"features_binaryWithoutSentiment"))
#     featureConfigs.append(("all",True,True,False,True,True,binaryWithoutPOS,"features_binaryWithoutPOS"))
#     featureConfigs.append(("all",True,False,False,True,True,binaryWithoutSignal,"features_binaryWithoutSignal"))
#     featureConfigs.append(("all",True,True,False,True,True,binaryWithoutSyntax,"features_binaryWithoutSyntax"))
#     featureConfigs.append(("all",True,True,False,True,False,binaryWithoutWeka,"features_binaryWithoutWeka"))
#     featureConfigs.append(("all",True,True,False,True,True,binaryWithoutTop10,"features_binaryWithoutTop10"))

#     featureConfigs.append(("all",True,True,False,True,False,allWithoutSentiment,"features_allWithoutSentiment"))
#     featureConfigs.append(("all",True,True,False,True,True,allWithoutPOS,"features_allWithoutPOS"))
#     featureConfigs.append(("all",True,False,False,True,True,allWithoutSignal,"features_allWithoutSignal"))
#     featureConfigs.append(("all",True,True,False,True,True,allWithoutSyntax,"features_allWithoutSyntax"))
#     featureConfigs.append(("all",True,True,False,True,False,allWithoutWeka,"features_allWithoutWeka"))
#     featureConfigs.append(("all",True,True,False,True,True,allWithoutTop10,"features_allWithoutTop10"))

    # feature categories:
#     featureConfigs.append(("all",True,False,False,True,True,sentimentConfig, "features_sentimentConfig"))
#     featureConfigs.append(("all",True,False,False,True,False,posConfig, "features_posConfig"))
#     featureConfigs.append(("all",True,True,False,True,False,signalGroupConfig, "features_signalGroupConfig"))
#     featureConfigs.append(("all",True,False,False,True,False,syntaxGroupConfig, "features_syntaxGroupConfig"))
#      
#     featureConfigs.append(("specific",True,False,False,False,True,sentimentConfig, "features_sentimentConfig_specific"))
#     featureConfigs.append(("specific",True,False,False,False,False,posConfig, "features_posConfig_specific"))
#     featureConfigs.append(("specific",True,True,False,False,False,signalGroupConfig, "features_signalGroupConfig_specific"))
#     featureConfigs.append(("specific",True,False,False,False,False,syntaxGroupConfig, "features_syntaxGroupConfig_specific"))

    # Combinations:
#     featureConfigs.append(("specific",True,True,False,False,True,sentimentAndSignal,"features_sentimentAndSignal"))
#     featureConfigs.append(("specific",True,False,False,False,True,syntaxAndSentiment,"features_syntaxAndSentiment"))
#     featureConfigs.append(("specific",True,True,False,False,False,syntaxAndSignal,"features_syntaxAndSignal"))

#     featureConfigs.append(("all",True,False,False,True,False,top10Config, "features_top10Config"))
   
    print(str(len(featureConfigs)) + " different configurations of features")
    
    # create file which contains status reports.
    with open("info.txt", "a") as info:
        info.write("Start" + "\n")
        print "Start"
        # TODO: Add condition to create corpus, if no file exists.
        info.write("Training the classifiers using the set at '{path}{file}'".format(
                                                        path=setPath,
                                                        file=trainingSetFilename) + "\n")
        print("Training the classifiers using the set at '{path}{file}'".format(
                                                        path=setPath,
                                                        file=trainingSetFilename))
        
        
        lt = localtime()
        info.write("Begin loading Corpus " + class1 + " vs " + class2 + " - " + str(lt[3]) + "h:" + str(lt[4]) + "m:" + str(lt[5]) + "s Uhr am " + str(lt[2]) + "." + str(lt[1]) + "." + str(lt[0]) + "\n")
        print("Begin loading Corpus " + class1 + " vs " + class2 + " - " + str(lt[3]) + "h:" + str(lt[4]) + "m:" + str(lt[5]) + "s Uhr am " + str(lt[2]) + "." + str(lt[1]) + "." + str(lt[0]))
        
        # load training corpus.
        trainingSet = Corpus(class1, class2, trainingSetFilename, corpusPath=CORPUS_PATH+TRAIN_PATH)
        
        
        # Get the ids - which are ordered class1, class2 and shuffle them.
        trainingIDs = trainingSet.tweetIDs          
        random.seed(randomSeed)
        random.shuffle(trainingIDs)
        
        # load test corpus if filename is given; not needed for cross validation.
        if testSetFilename is not None:
            testSet = Corpus(class1, class2, testSetFilename, corpusPath=CORPUS_PATH+TEST_PATH)
            tweets = dict(list(trainingSet.tweets.items()) + list(testSet.tweets.items()))

            mode_list = []
            bigram_list = []
    
            # only create dict for bag-of-words and bigrams if really necessary!
            for mode, feat, regExp, stack_binning, bigram, sentiment, config, createARFF in featureConfigs:
                mode_list.append(mode)
                bigram_list.append(bigram)
    
            if "all" in mode_list:
                bowDictionary = createBagOfWordsDictionary(tweets)
                print "loaded bow"
            elif "bow" in mode_list:
                bowDictionary = createBagOfWordsDictionary(tweets)
                print "loaded bow"
            else:
                bowDictionary = {}
                print "bow not necessary"
    
            if True in bigram_list:
                bigramDictionary = createBagOfBigramsDictionary(tweets)
                print "loaded bigrams"
            else:
                bigramDictionary = {}
                print "bigrams not necessary"

        else:
            bowDictionary = None
            bigramDictionary = None
        
        lt = localtime()
        info.write("Corpus loaded -" + str(lt[3]) + "h:" + str(lt[4]) + "m:" + str(lt[5]) + "s Uhr am " + str(lt[2]) + "." + str(lt[1]) + "." + str(lt[0])+ "\n")
        print("Corpus loaded -" + str(lt[3]) + "h:" + str(lt[4]) + "m:" + str(lt[5]) + "s Uhr am " + str(lt[2]) + "." + str(lt[1]) + "." + str(lt[0]))

        info.write("Extracting features with different configurations \n")
        print("Extracting features with different configurations")
        

    
    t = 0
    
    # feature extraction using above feature configurations.
    for mode, feat, regExp, stack_binning, bigram, sentiment, config, createARFF in featureConfigs:
        
        trainFeatures = []
        trainFeatureVectors = {}
        testFeatures = []
        testFeatureVectors = {}
        
        t += 1
        
        config_name = createARFF + "_" + class1 + "_vs_" + class2
        # createARFF_file controls ARFF export: an empty string suppresses the
        # ARFF file, otherwise the configuration name is used as the file name.
        createARFF_file = createARFF
        
        with open("info_" + config_name + ".txt", "a") as info:
            print "\n" + str(t) + "th configuration\n-----------------------------------------\n"
            info.write("\n" + str(t) + "th configuration\n-----------------------------------------\n")
        
        # optional: if true, then all binary combinations of all 
        # features are added to feature list.        
        binary_combination=False       
        
        # feature extraction.    
        trainFeatures, trainFeatureVectors = extractFeatures(class1, class2, mode, arff_path, trainingIDs, trainingSet.tweets, config, feat, regExp, stack_binning, binary_combination, sentiment, bigram, createARFF_file, bowDictionary, bigramDictionary)
                
        # array of train data - is not necessary; just used for safeguard.
        tTargets = []
        tData = []
        
        # sparse matrix of train data:
        ID_map_train = {}
        rdim = len(trainFeatureVectors.keys())
        cdim = len(trainFeatureVectors[trainingIDs[0]])
        
        # create sparse matrix with rdim x cdim
        trainData = lil_matrix((rdim, cdim))
                    
        trainGold = trainingSet.goldStandard
        
        trainTargets = list(range(len(trainGold)))
        j = 0
        for ID, g in trainGold.items():
            
            ID_map_train[j] = ID
            
            # array part.      
            tTargets.append(g)
            tData.append(trainFeatureVectors[ID])
            
            # matrix will be filled.
            for i in range(len(trainFeatureVectors[ID])):
                if trainFeatureVectors[ID][i] != 0:
                    trainData[j, i] = trainFeatureVectors[ID][i]
                    trainTargets[j] = g
            
            j += 1
        
        trainFeatureVectors = {}
        trainGold = {} 
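        # Hedged note (editor's addition): lil_matrix suits the incremental,
        # element-wise fill above. If the downstream helpers accept any scipy
        # sparse format, converting to CSR here is the usual idiom, e.g.:
        # trainData = trainData.tocsr()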
                
        classifiers = [(DecisionTreeClassifier(), "Decision_Tree"),
                       (SVC(kernel="linear"), "Linear_SVC"),
                       (SVC(), "SVC"),
                       (LinearSVC(), "LinearSVC"),
                       (LogisticRegression(), "logRegression")]
        
        # classifiers which need matrix
        matrixClassifier = ["Linear_SVC", "SVC", "LinearSVC", "logRegression"]
        
        # Cross validation
        if testSetFilename is None:
            for c, name in classifiers:
                if name in matrixClassifier:
                    if isspmatrix(trainData):
                        duration = timeit(lambda: applyCrossValidation(class1, class2, createARFF, ID_map_train, c, name, trainData, trainTargets, 10), number=1)        
                        showDuration(createARFF, name, duration)

                else:
                    duration = timeit(lambda: applyCrossValidation(class1, class2, createARFF, ID_map_train, c, name, tData, tTargets, 10), number=1)                
                    showDuration(createARFF, name, duration)
                    
        # use test data for evaluation.
        else:
            with open("info.txt", "a") as info:   
                info.write("Testing the classifiers using the set at '{path}{file}'".format(
                                                            path=CORPUS_PATH,
                                                            file=testSetFilename) + "\n")
                print("Testing the classifiers using the set at '{path}{file}'".format(
                                                            path=CORPUS_PATH,
                                                            file=testSetFilename))
        
                info.write("Extracting features... \n")
                testIDs = testSet.tweetIDs
                
                random.seed(randomSeed)
                random.shuffle(testIDs)
                
                # feature extraction for test data.
                testFeatures, testFeatureVectors = extractFeatures(class1, class2, mode, arff_path, testIDs, testSet.tweets, config, feat, regExp, stack_binning, binary_combination, sentiment, bigram, createARFF_file, bowDictionary, bigramDictionary)
                                
                # array of test data:
                tsTargets = []
                tsData = []
                
                # sparse matrix of test data                   
                rdim = len(testFeatureVectors.keys())
                cdim = len(testFeatureVectors[testIDs[0]])
                testData = lil_matrix((rdim, cdim))
                
                testGold = testSet.goldStandard
                
                testTargets = list(range(len(testGold)))
               
                ID_map_test = {}
                j = 0
                
                for ID, g in testGold.items():
                    
                    ID_map_test[j] = ID
                     
                    # array           
                    tsTargets.append(g)
                    tsData.append(testFeatureVectors[ID])
                    
                    # matrix
                    for i in range(len(testFeatureVectors[ID])):
                        if testFeatureVectors[ID][i] != 0:
                            testData[j, i] = testFeatureVectors[ID][i]
                            testTargets[j] = g
                    
                    j += 1
                
                testFeatureVectors = {}
                testGold = {}
         
                for c, name in classifiers:
                    if name in matrixClassifier:
                        duration = timeit(lambda: applyClassifier(class1, class2, createARFF, ID_map_test, c, name, trainData, trainTargets, testData, testTargets), number=1)
                        showDuration(createARFF, name, duration)
                        
                    else:
                        duration = timeit(lambda: applyClassifier(class1, class2, createARFF, ID_map_test, c, name, tData, tTargets, tsData, tsTargets), number=1)                   
                        showDuration(createARFF, name, duration)
Example #23
0
def testFeatures():
    """Tests if the features work on the corpus."""
    ironicIDs, regularIDs, reviews = createTestReviews()
    features, featureVectors = extractFeatures(ironicIDs + regularIDs, reviews)
    showFeatureOccurrence(features, featureVectors)
Example #25
0
def main(fastafile, classfile, kernel, evalmodel, paramfile):
    """
    Train a kernel classifier on the protein sequences in fastafile, optionally
    evaluate it with cross-validation, and save the final model for inference.
    """
    # number of cross-validation folds for evaluating model on training data
    if "trich" in fastafile:
        nfolds = 5
    else:
        nfolds = 10
    # output directory for results on training data
    resdir = "./results/"

    dname = fastafile.split("/")[-1].split(".fasta")[0]
   
    # load protein IDs and labels - in same order as .class file
    labels = OrderedDict()
    with open(classfile) as fin:
        for line in fin:
            p, lab = line.rstrip().split()
            labels[p] = int(lab)

    # load FASTA file
    # calculate features for each sequence 
    with open(fastafile) as fin:
        seqstr = fin.read() 

    IDs, X = extractFeatures(seqstr)
    # make sure features are in the right order 
    newX = []
    for ID in labels.keys():
        newX.append(X[IDs.index(ID)])
    X = newX
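    # Hedged note (editor's addition): IDs.index(ID) makes the reordering
    # above quadratic in the number of sequences; a lookup table keeps it
    # linear and is equivalent:
    # position = {ID: i for i, ID in enumerate(IDs)}
    # X = [X[position[ID]] for ID in labels]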

    # feature and label matrices to train model
    X = np.array(X)
    Y = np.array(list(labels.values()))

    if evalmodel:
        # cross-val folds (leave-one-out cross-val if less than 10 ex)
        ntrain = X.shape[0]
        if ntrain <= 10:
            cv = LeaveOneOut(n=ntrain)
        else:
            cv = StratifiedKFold(Y, nfolds, shuffle=True)
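        # Hedged note (editor's addition): LeaveOneOut(n=...) and
        # StratifiedKFold(Y, nfolds, ...) follow the pre-0.18 scikit-learn
        # API. With a current scikit-learn the equivalent would roughly be:
        #     from sklearn.model_selection import LeaveOneOut, StratifiedKFold
        #     splitter = LeaveOneOut() if ntrain <= 10 else StratifiedKFold(nfolds, shuffle=True)
        #     cv = splitter.split(X, Y)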
        # cross-validation within dataset
        results = crossValidate(X, Y, cv, kernel, paramfile)
        results.update({'dname':dname, 'protIDs':labels})
        if not os.path.isdir(resdir):
            os.makedirs(resdir)
        resfile = resdir+dname+"_results.txt"
        printResults(results, resfile)

    """ train on all data and save final model for inference """
    # define parameters (grid to search)
    params = getParams(kernel, paramfile)
    bestC, bestKparam = selectParams(X, Y, kernel, params, nfolds)

    # standardize the data
#    X, _, scaler = standardizeData(X, None)
    scaler = None
    # construct training kernel
    K = calculateKernel(X, X, bestKparam, kernel)
    # train model
    clf = trainModel(K, Y, bestC)
    # save model
    support_vectors = X[clf.support_,:]
    savedict={'dualvars':clf.dual_coef_, 'kernel':kernel, 
              'Kparam':bestKparam, 'support_vectors':support_vectors, 
              'scaler':scaler, 'intercept':clf.intercept_}
    with open("./models/"+dname+"_model.pkl", "wb") as fout:
        pickle.dump(savedict, fout)
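    # Hedged sketch (editor's addition): loading the pickled model for
    # inference would mirror the dict saved above. calculateKernel's argument
    # order and the variable Xtest are assumptions here, not part of the original.
    # with open("./models/" + dname + "_model.pkl", "rb") as fin:
    #     model = pickle.load(fin)
    # Ktest = calculateKernel(Xtest, model['support_vectors'],
    #                         model['Kparam'], model['kernel'])
    # scores = Ktest.dot(model['dualvars'].T) + model['intercept']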
Example #26
0
def applyML(trainingSetFilename, testSetFilename=None, setPath=CORPUS_PATH):
    """
    Uses machine learning approach to classify sentences.

    """
    # TODO: Add condition to create corpus, if no file exists.
    print("Training the classifiers using the set at '{path}{file}'".format(
                                                    path=setPath, 
                                                    file=trainingSetFilename))
    
    trainingSet = Corpus(trainingSetFilename, corpusPath=CORPUS_PATH)

    # trainingSet = Corpus.loadCorpus(filename="shuffled_set.pk")

    # for each in trainingSet.reviewIDs[0:10]:
    #     print(each)
    # print()

    # Get the ids - which are ordered ironic, regular - and shuffle them.
    ids = trainingSet.reviewIDs
    random.seed(44)
    random.shuffle(ids)
    # for each in ids[0:10]:
    #     print(each)
    # print()

    reviews = trainingSet.reviews

    if testSetFilename is not None:
        testSet = Corpus(testSetFilename, corpusPath=CORPUS_PATH)
        reviews = dict(list(trainingSet.reviews.items()) + list(testSet.reviews.items()))
        bowDictionary = createBagOfWordsDictionary(reviews)
    else:
        bowDictionary = None

    print("Extracting features...")
    trainFeatures, trainFeatureVectors = extractFeatures(ids, 
                                            trainingSet.reviews,
                                            bowDictionary=bowDictionary)

    trainTargets = []
    trainData = []
    stars = []

    trainGold = trainingSet.goldStandard
    for ID, g in trainGold.items():
        trainTargets.append(g)
        trainData.append(trainFeatureVectors[ID])
        stars.append(trainingSet.reviews[ID].stars)
    #for i, vec in enumerate(data):
    #    print(targets[i], " | ", vec)

    featureCount = sum([sum(v) for v in trainData])
    # print("Feature found: ", featureCount, "times.")


    classifiers = [DecisionTreeClassifier(),
                    SVC(kernel="linear"), 
                    SVC(), 
                    LinearSVC(), 
                    MultinomialNB(),
                    GaussianNB(),
                    RandomForestClassifier(),
                    LogisticRegression(),]

    # Cross validation
    if testSetFilename == None:
        for c in classifiers:
            applyCrossValidation(c, trainData, trainTargets)
            
            # Show star distribution for each classifier
            # applyCrossValidation(c, trainData, trainTargets, stars=stars)
        
        # scores = cross_validation.cross_val_score(classifier, array(data), 
        #                                         array(targets), cv=10)
        # print(scores)
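        # Hedged note (editor's addition): the commented call above targets the
        # old sklearn.cross_validation module; with a current scikit-learn, and
        # assuming numpy's array is imported as in that call, the equivalent is
        # roughly:
        #     from sklearn.model_selection import cross_val_score
        #     scores = cross_val_score(c, array(trainData), array(trainTargets), cv=10)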

    else:
        print("Testing the classifiers using the set at '{path}{file}'".format(
                                                    path=CORPUS_PATH, 
                                                    file=testSetFilename))

        # testSet = Corpus(testSetFilename, corpusPath=CORPUS_PATH)
        # testSet = Corpus.loadCorpus(filename="test_set.pk")

        # Create bag of words dictionary that contains words of all reviews
        # bowDictionary = createBagOfWordsDictionary(
        #                     trainingSet.reviews + testSet.reviews)


        print("Extracting features...")
        testFeatures, testFeatureVectors = extractFeatures(testSet.reviewIDs, 
                                                    testSet.reviews, 
                                                    bowDictionary=bowDictionary)

        testData = []
        testTargets = []

        testGold = testSet.goldStandard
        for ID, g in testGold.items():
            testTargets.append(g)
            testData.append(testFeatureVectors[ID])

        for c in classifiers:
            applyClassifier(c, trainData, trainTargets, testData, testTargets)
Example #27
0
# avoid shadowing the built-in "object"
subjects = sorted(os.listdir(globals.data_train_images))

i, total = 0, len(subjects)

with Timer() as timer:
    for subject in subjects:
        i += 1
        print('Processing the subdirectory named:', subject, '\t[', i, '/', total, ']', file=globals.file)

        # Read in cropped data
        crop_names = os.listdir(os.path.join(globals.data_train_images, subject))
        crop_names = list(map(lambda x: os.path.join(globals.data_train_images, subject, x), crop_names)) 
        crops = [cv.imread(x, cv.IMREAD_GRAYSCALE) for x in crop_names]
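        # Editor's addition (hedged): cv.imread returns None for files it
        # cannot decode; dropping those entries avoids errors downstream.
        crops = [crop for crop in crops if crop is not None]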

        # Get Features
        desc = features.extractFeatures(crops, features.features)
        all[subject] = desc
        print('Extracted', arguments.descriptor, '\n', file = globals.file)

print('Time:', timer, '\n', file = globals.file)

# Print
print('Done!\n')

# Print
print('Create Bag of Visual Features\n')

# Features
matrix = features.groupAllFeatures(all)

kmeans = None
Example #28
0
def exportFeatures():
    corpus = Corpus(SET_FILENAMES[3])
    features, featureVectors = extractFeatures(corpus.reviewIDs,
                                               corpus.reviews,
                                               features=None,
                                               createARFF=True)
# define which features have to be extracted from test data (same as in loaded model).
featureConfigs = []
top10_iro_sarc_Config = [43, 57, 52, 156, 44, 56, 51, 42, 50, 49]
featureConfigs.append(("all", True, False, False, True, top10_iro_sarc_Config,
                       "features_top10Config"))

# feature extraction.
for mode, feat, regExp, new_stack, bigram, config, createARFF in featureConfigs:

    createARFF = createARFF + "_" + class1 + "_vs_" + class2

    binary_combination = False
    sentiment = False

    testFeatures, testFeatureVectors = extractFeatures(
        class1, class2, mode, arff_path, testIDs, testSet.tweets, config, feat,
        regExp, new_stack, binary_combination, sentiment, bigram, createARFF,
        bowDictionary, bigramDictionary)

# array of test data:
tsTargets = []
tsData = []

# sparse matrix of test data
rdim = len(testFeatureVectors.keys())
cdim = len(testFeatureVectors[testIDs[0]])
testData = lil_matrix((rdim, cdim))

testGold = testSet.goldStandard

testTargets = range(len(testGold))
def applyML2(trainingSetFilename, testSetFilename=None, setPath=CORPUS_PATH):
    """
    Uses machine learning approach to classify sentences.
    Implements a truly simple 'Leave One Out' function.
    """
    # TODO: Add condition to create corpus, if no file exists.
    print("Training the classifiers using the set at '{path}{file}'".format(
                                                    path=setPath, 
                                                    file=trainingSetFilename))
    
    #trainingSet = Corpus(trainingSetFilename, corpusPath=CORPUS_PATH)
    # trainingSet = Corpus.loadCorpus(filename=trainingSetFilename)
    # trainingSet = Corpus.loadCorpus(filename="training_and_validation_set.pk")
    trainingSet = Corpus.loadCorpus(filename="shuffled_set.pk")


    # for each in trainingSet.reviewIDs[0:10]:
    #     print(each)
    # print()

    # Get the ids - which are ordered ironic, regular - and shuffle them.
    ids = trainingSet.reviewIDs
    random.seed(44)
    random.shuffle(ids)
    # for each in ids[0:10]:
    #     print(each)
    # print()

    # If the -new flag is not set OR there is no file to load,
    # rebuild the corpus from scratch.


    print("Extracting features...")
#    trainFeatures, trainFeatureVectors = extractFeatures(trainingSet.reviewIDs, 
#                                                trainingSet.reviews)

    
    featureConfig = {
        "minus Imba": { u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Quotes": {u"Positive Imbalance": (u"w-\u2605 ", 
                            posStarPolarityDiscrepancy),
                        u"Negative Imbalance": (u"w+\u2606 ", 
                            negStarPolarityDiscrepancy),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Pos/Neg&Punctuation": {u"Positive Imbalance": (u"w-\u2605 ", 
                            posStarPolarityDiscrepancy),
                        u"Negative Imbalance": (u"w+\u2606 ", 
                            negStarPolarityDiscrepancy),
                        u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Hyperbole": {u"Positive Imbalance": (u"w-\u2605 ", 
                            posStarPolarityDiscrepancy),
                        u"Negative Imbalance": (u"w+\u2606 ", 
                            negStarPolarityDiscrepancy),
                        u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Ellipsis and Punctuation": {u"Positive Imbalance": (u"w-\u2605 ", 
                            posStarPolarityDiscrepancy),
                        u"Negative Imbalance": (u"w+\u2606 ", 
                            negStarPolarityDiscrepancy),
                        u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Pos/Neg&Ellipsis": {u"Positive Imbalance": (u"w-\u2605 ", 
                            posStarPolarityDiscrepancy),
                        u"Negative Imbalance": (u"w+\u2606 ", 
                            negStarPolarityDiscrepancy),
                        u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
        },
        "minus Pos": {  u"Negative Imbalance": (u"w+\u2606 ", 
                            negStarPolarityDiscrepancy),
                        u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
                        u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
                        u"Negative Hyperbole": (u"3w-", negativeStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Neg": {u"Positive Imbalance": (u"w-\u2605 ", 
                            posStarPolarityDiscrepancy),
                        u"Positive Quotes": (u"\"..\"", scareQuotes),
                        u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
                        u"Positive Hyperbole": (u"3w+", positiveStreak),
                        u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
                        u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(x, pattern=r"(\.\.|\. \. \.)$")),
        },
    }
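    # Hedged sketch (editor's addition, defined but not used below): the
    # "minus X" configurations above repeat the same entries many times.
    # Given a dict holding the full feature set, each ablation could be
    # derived instead of written out by hand.
    def _withoutFeatures(fullConfig, *featureNames):
        """Return a copy of fullConfig with the named features removed."""
        return {k: v for k, v in fullConfig.items() if k not in featureNames}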



    for name, config in featureConfig.items():
        print("\n"*5, name)
        print("-"*60)
        for each in config:
            print(each)
        print()



        trainFeatures, trainFeatureVectors = extractFeatures(ids, 
                                                trainingSet.reviews, config)

        trainTargets = []
        trainData = []

        trainGold = trainingSet.goldStandard
        for ID, g in trainGold.items():
            trainTargets.append(g)
            trainData.append(trainFeatureVectors[ID])
        #for i, vec in enumerate(data):
        #    print(targets[i], " | ", vec)

        featureCount = sum([sum(v) for v in trainData])
        print("Feature found: ", featureCount, "times.")


        classifiers = [DecisionTreeClassifier(),
                        SVC(kernel="linear"), 
                        SVC(), 
                        LinearSVC(), 
                        MultinomialNB(),
                        GaussianNB(),
                        RandomForestClassifier(),
                        LogisticRegression(),]

        # Cross validation
        if testSetFilename == None:
            for c in classifiers:
                applyCrossValidation(c, trainData, trainTargets)
            
            # scores = cross_validation.cross_val_score(classifier, array(data), 
            #                                         array(targets), cv=10)
            # print(scores)

        else:
            print("Testing the classifiers using the set at '{path}{file}'".format(
                                                        path=CORPUS_PATH, 
                                                        file=testSetFilename))
            
            testSet = Corpus(testSetFilename, corpusPath=CORPUS_PATH)
            # testSet = Corpus.loadCorpus(filename="test_set.pk")

            print("Extracting features...")
            testFeatures, testFeatureVectors = extractFeatures(testSet.reviewIDs, 
                                                    testSet.reviews)

            testData = []
            testTargets = []

            testGold = testSet.goldStandard
            for ID, g in testGold.items():
                testTargets.append(g)
                testData.append(testFeatureVectors[ID])

            for c in classifiers:
                applyClassifier(c, trainData, trainTargets, testData, testTargets)