Example #1
    def get_texts(self):
        # Yield one tokenized, cleaned token stream per input XML file.
        for filename in self.input:
            with open(filename) as infile:
                raw = infile.read()
            root = ET.fromstring(raw)
            lang = root.attrib['lang'].lower()
            genre = root.attrib['type']
            tree = ET.ElementTree(root)
            yield tokenize(clean(raw, lang, genre, tree))
Example #2
    def get_texts(self):
        # Yield one tokenized, cleaned token stream per input XML file.
        for filename in self.input:
            with open(filename) as infile:
                raw = infile.read()
            root = ET.fromstring(raw)
            lang = root.attrib['lang'].lower()
            genre = root.attrib['type']
            tree = ET.ElementTree(root)
            yield tokenize(clean(raw, lang, genre, tree))
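The two snippets above have the shape of a gensim TextCorpus.get_texts override: the corpus yields one token list per input XML file. Below is a minimal, self-contained sketch of how such a corpus class might be wired up and consumed, assuming gensim is the target library; the AuthorCorpus name is illustrative, and clean()/tokenize() are simplified stand-ins for the project's own helpers.

# Sketch only: gensim is assumed; clean() and tokenize() below are
# simplified stand-ins for the project's helpers of the same names.
import xml.etree.ElementTree as ET
from gensim import corpora, models

def clean(raw, lang, genre, tree):
    return raw                      # the real clean() strips markup per language/genre

def tokenize(text):
    return text.lower().split()     # the real tokenize() is the project's tokenizer

class AuthorCorpus(corpora.TextCorpus):
    def get_texts(self):
        for filename in self.input:
            with open(filename) as infile:
                raw = infile.read()
            root = ET.fromstring(raw)
            lang = root.attrib['lang'].lower()
            genre = root.attrib['type']
            yield tokenize(clean(raw, lang, genre, ET.ElementTree(root)))

#corpus = AuthorCorpus(["author1.xml", "author2.xml"])
#lda = models.LdaModel(corpus, id2word=corpus.dictionary, num_topics=25)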
Example #3
    def get_texts(self):
        # Concatenate every input document into one cleaned string and
        # yield a single token list for the whole collection.
        text = ""
        for index in self.input[0]:
            filename = self.input[1][index]
            with open(filename) as infile:
                raw = infile.read()
            root = ET.fromstring(raw)
            lang = root.attrib['lang'].lower()
            genre = root.attrib['type']
            tree = ET.ElementTree(root)
            text += clean(raw, lang, genre, tree)
        yield tokenize(text)
Example #4
    def get_texts(self):
        # Concatenate every input document into one cleaned string and
        # yield a single token list for the whole collection.
        text = ""
        for index in self.input[0]:
            filename = self.input[1][index]
            with open(filename) as infile:
                raw = infile.read()
            root = ET.fromstring(raw)
            lang = root.attrib['lang'].lower()
            genre = root.attrib['type']
            tree = ET.ElementTree(root)
            text += clean(raw, lang, genre, tree)
        yield tokenize(text)
Example #5
def run(args):
    # Load the pre-trained models for this corpus type/language, extract
    # features from every author XML file in input_path, and write one
    # <author/> prediction file per author to output_path.
    #input_path = "/Users/jamarq_laptop/PAN/Software/pan-2/Data/blogs/es/"
    #model_path = "./Models/"
    #output_path = "./Outputs/"

    input_path = args.input
    output_path = args.output
    model_path = args.model

    if not input_path.endswith("/"):
        input_path += "/"
    if not output_path.endswith("/"):
        output_path += "/"
    if not model_path.endswith("/"):
        model_path += "/"

    files = [f for f in os.listdir(input_path) if f.endswith('.xml')]

    tree = ET.parse(input_path + files[0])
    root = tree.getroot()

    type = root.attrib["type"] + "_" + root.attrib["lang"]

    clf = joblib.load(model_path + type + '.pkl')
    topic_model = joblib.load(model_path + type + '_topic_model.pkl')
    scaler = joblib.load(model_path + type + "_scaler.pkl")

    for file in files:

        features = []
        tree = ET.parse(input_path + file)
        root = tree.getroot()

        aut_id = file.split("_")[0]

        if "." in aut_id:
            aut_id = aut_id.split(".")[0]

        lang = type.split("_")[1]
        doccount = tree.find("documents").attrib["count"]

        features.extend(topic_model.get_sim_unseen(input_path + file))

        features.append(int(doccount))

        xmlstr = clean(ET.tostring(root), root.attrib['lang'].lower(),
                       root.attrib['type'], tree)
        xmlstr = filter(lambda x: x in string.printable, xmlstr)
        tokens = word_tokenize(xmlstr)
        nostop_tokens = removeStopWords(tokens, type.split("_")[1])

        features.append(extractRepeatation(nostop_tokens, type.split("_")[1]))

        cap_word, cap_let = extractCapital(nostop_tokens)
        features.append(cap_word)
        features.append(cap_let)

        features.append(len(tokens))

        LIWCDic = readDictionary(getLIWCOnLang(lang))
        features.extend(extractLIWCFeatures(tokens, LIWCDic))

        features.extend(extractReadabilityArray(xmlstr, nostop_tokens))

        features.append(extractEmoticons(xmlstr, nostop_tokens))

        features.extend(extractHTMLTags(xmlstr, nostop_tokens))

        # scikit-learn expects a 2-D array: one row per author sample.
        label = clf.predict(scaler.transform([features]))

        author = ET.Element('author')
        author.set("id", aut_id)
        author.set("type", type.split("_")[0])
        author.set("lang", type.split("_")[1])
        author.set("age_group", label[0].split(" ")[0])
        author.set("gender", label[0].split(" ")[1])

        ET.ElementTree(author).write(output_path + aut_id + ".xml")

    print "All done!"
    print '\a'
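Example #5 assumes model files produced by a separate training run (Example #6 below). The following is a minimal, self-contained sketch of that save/load/predict cycle on toy features, showing how the .pkl artifacts and the combined "age_group gender" labels fit together; the file names and feature values are illustrative, and joblib is assumed to live under sklearn.externals as in scikit-learn releases of this period.

# Toy round trip of the scaler/classifier artifacts used in Examples #5 and #6.
import numpy
from sklearn import preprocessing, svm
from sklearn.externals import joblib   # assumption: era-appropriate joblib location

X = numpy.array([[3.0, 120.0], [1.0, 40.0], [4.0, 300.0], [2.0, 60.0]])
y = ["18-24 male", "25-34 female", "18-24 male", "25-34 female"]

scaler = preprocessing.MinMaxScaler()
clf = svm.LinearSVC()
clf.fit(scaler.fit_transform(X), y)

joblib.dump(scaler, "blog_en_scaler.pkl")   # illustrative file names
joblib.dump(clf, "blog_en.pkl")

label = joblib.load("blog_en.pkl").predict(
    joblib.load("blog_en_scaler.pkl").transform([[2.0, 80.0]]))
age_group, gender = label[0].split(" ")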
Example #6
def run(args):
    # Build per-type training data from the author XML files in input_path,
    # train the topic model, scaler and SVM, and dump them to output_path.
    #print "test, fix these before deployment"
    #input_path = "/Users/jamarq_laptop/PAN/Software/pan-2/Data/blogs/es/"
    #out_path = "./Models/"

    input_path = args.input
    output_path = args.output

    if not input_path.endswith("/"):
        input_path += "/"
    if not output_path.endswith("/"):
        output_path += "/"

    truth_in_xml = True
    truth_dic = dict()

    if os.path.isfile(input_path+"truth.txt"):
        print "using truth.txt file"
        truth_in_xml = False
        with open(input_path+"truth.txt","rb") as truth_in:
            for line in truth_in:
                truth_data = line.strip().split(":::")
                truth_dic[truth_data[0]] = (truth_data[2],truth_data[1])

    print "input path: " + input_path
    topic_model = TopicModels(input_path)
    #print example.get_divergences_seen(1)

    files = [f for f in os.listdir(input_path) if f.endswith('.xml')]
    print str(len(files)) + " files in corpus"

    tree = ET.parse(input_path + files[0])
    root = tree.getroot()

    type = root.attrib["type"] + "_" + root.attrib["lang"]

    features = []
    labels = []
    ages = []
    genders = []
    ids = []

    doc_index = 0
    for file in files:

        these_features = []

        these_features.extend(topic_model.get_sim_seen(doc_index))

        tree = ET.parse(input_path + file)
        root = tree.getroot()
        these_features.append(int(tree.find("documents").attrib["count"]))

        xmlstr = clean(ET.tostring(root), root.attrib['lang'].lower(),
                       root.attrib['type'], tree)
        xmlstr = filter(lambda x: x in string.printable, xmlstr)
        tokens = word_tokenize(xmlstr)
        nostop_tokens = removeStopWords(tokens, type.split("_")[1])

        these_features.append(extractRepeatation(nostop_tokens, type.split("_")[1]))
		
        cap_word, cap_let = extractCapital(nostop_tokens)
        these_features.append(cap_word)
        these_features.append(cap_let)

        these_features.append(len(tokens))

        LIWCDic = readDictionary(getLIWCOnLang(type.split("_")[1]))
        these_features.extend(extractLIWCFeatures(tokens, LIWCDic))

        these_features.extend(extractReadabilityArray(xmlstr,nostop_tokens))

        these_features.append(extractEmoticons(xmlstr, nostop_tokens))

        these_features.extend(extractHTMLTags(xmlstr, nostop_tokens))

        features.append(these_features)

        #Label each row
        id = file.split(".")[0].split("_")[0]
        if truth_in_xml:
            labels.append(root.attrib["age_group"]+" "+root.attrib["gender"].lower())
            ages.append(root.attrib["age_group"])
            genders.append(root.attrib["gender"].lower())
        else:
            labels.append(truth_dic[id][1] + " " + truth_dic[id][0].lower())
            ages.append(truth_dic[id][1])
            genders.append(truth_dic[id][0])

        ids.append(id)

        # Move to the next document for the topic-model similarity lookup.
        doc_index += 1

    features = numpy.array(features)
    scaler = preprocessing.MinMaxScaler()
    features = scaler.fit_transform(features)

    '''
    col_max = [ max(x) for x in zip(*features) ]
    col_min = [ min(x) for x in zip(*features) ]
    for feature in features:
        for obs in range(0,len(feature)):
            val = 0
            try:
                val = (feature[obs] - col_min[obs])/(col_max[obs]-col_min[obs])
            except ZeroDivisionError:
                val = 0
            feature[obs] = float(val)
    '''

    writeCSV(features.tolist(), labels, ages, genders, ids, type+'_features.csv')

    #Train liblinear implementation of SVM
    clf = svm.LinearSVC()
    clf.fit(features, labels)

    joblib.dump(scaler, output_path+type+'_scaler.pkl')
    joblib.dump(topic_model, output_path+type+'_topic_model.pkl')
    joblib.dump(clf, output_path+type+'.pkl') 
    print "All done!"
    print '\a'
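The block commented out near the end of Example #6 is a hand-rolled per-column min-max normalization; preprocessing.MinMaxScaler computes the same per-column scaling, which is presumably why the manual version was disabled. A small check on toy data, kept separate from the example above:

# Toy comparison of MinMaxScaler against the manual column-wise scaling
# sketched in the commented-out block of Example #6.
import numpy
from sklearn import preprocessing

X = numpy.array([[1.0, 10.0], [2.0, 10.0], [4.0, 30.0]])

scaled = preprocessing.MinMaxScaler().fit_transform(X)

col_min, col_max = X.min(axis=0), X.max(axis=0)
span = numpy.where(col_max > col_min, col_max - col_min, 1.0)  # guard the constant-column case
manual = (X - col_min) / span

assert numpy.allclose(scaled, manual)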
Example #7
					genders.append(truth_data[1].lower())
					truth_dic[truth_data[0]] = (truth_data[2]+" "+truth_data[1].lower(),truth_data[2],truth_data[1])
	else:
		for file in files:
			tree = ET.parse(file)
			root = tree.getroot()
			labels.append(root.attrib["age_group"]+" "+root.attrib["gender"].lower())
			ages.append(root.attrib["age_group"])
			genders.append(root.attrib["gender"].lower())

	tree = ET.parse(files[0])

	#Clean all texts
	for file in files:
		with open(file) as infile:
			texts.append(clean(infile.read(),tree.getroot()))
	
	#Make train/test sets

	comb_skf = StratifiedKFold(labels, folds)
	age_skf = StratifiedKFold(ages, folds)
	gender_skf = StratifiedKFold(genders, folds)

	age_accuracy = []
	gender_accuracy = []
	combined_accuracy = []

	#Train and fit models based on age, gender, and combined labels
	print "Evaluating comb models"
	for train, test in comb_skf:
Example #8
def cleanAndRun(files, truth_files, lang):
    # Cross-validate combined, age-only, and gender-only classifiers over the
    # given author files, using the gold labels from truth_files.
    print "running on " + lang + " files"

    labels = []
    ages = []
    genders = []

    texts = []
    truth_dic = dict()

    for file in truth_files:
        with open(file) as intruth:
            for line in intruth:
                truth_data = line.strip().split(":::")
                truth_dic[truth_data[0]] = (truth_data[2],
                                            truth_data[1].lower())

    for file in files:
        user_id = file.split("/")[-1].split(".")[0]
        if user_id not in truth_dic:
            raise Exception("no truth data found for user " + user_id)

        age = truth_dic[user_id][0]
        gender = truth_dic[user_id][1]
        labels.append(age + " " + gender)
        ages.append(age)
        genders.append(gender)

    print "label count: " + str(len(labels))

    tree = ET.parse(files[0])

    #Clean all texts
    for file in files:
        with open(file) as infile:
            texts.append(clean(infile.read(), tree.getroot()))

    print "text count: " + str(len(texts))

    #Make train/test sets

    comb_skf = StratifiedKFold(labels, folds)
    age_skf = StratifiedKFold(ages, folds)
    gender_skf = StratifiedKFold(genders, folds)

    age_accuracy = []
    gender_accuracy = []
    combined_accuracy = []

    #Train and fit models based on age, gender, and combined labels
    print "Evaluating comb models"
    for train, test in comb_skf:

        train_files = selectIndexes(texts, train)
        test_files = selectIndexes(texts, test)

        train_labels = selectIndexes(labels, train)
        test_labels = selectIndexes(labels, test)

        combined_accuracy.append(
            run_classifier(train_files, test_files, train_labels, test_labels,
                           gram_type, feature_count))

    #print "All combined accuracies: " + str(combined_accuracy)
    print "Combined accuracy after " + str(
        folds) + " fold cross validation using " + gram_type + ": " + str(
            sum(combined_accuracy) / len(combined_accuracy))

    print "Evaluating age models"
    for train, test in age_skf:
        train_files = selectIndexes(texts, train)
        test_files = selectIndexes(texts, test)

        train_ages = selectIndexes(ages, train)
        test_ages = selectIndexes(ages, test)

        age_accuracy.append(
            run_classifier(train_files, test_files, train_ages, test_ages,
                           gram_type, feature_count))

    #print "All age accuracies: " + str(age_accuracy)
    print "Age accuracy after " + str(
        folds) + " fold cross validation using " + gram_type + ": " + str(
            sum(age_accuracy) / len(age_accuracy))

    print "Evaluating gender models"
    for train, test in gender_skf:
        train_files = selectIndexes(texts, train)
        test_files = selectIndexes(texts, test)

        train_genders = selectIndexes(genders, train)
        test_genders = selectIndexes(genders, test)

        gender_accuracy.append(
            run_classifier(train_files, test_files, train_genders,
                           test_genders, gram_type, feature_count))

    #print "All gender accuracies: " + str(gender_accuracy)
    print "Gender accuracy after " + str(
        folds) + " fold cross validation using " + gram_type + ": " + str(
            sum(gender_accuracy) / len(gender_accuracy))
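Examples #7 and #8 use the pre-0.18 scikit-learn cross-validation API, in which StratifiedKFold(labels, n_folds) is itself iterable and yields a (train_indices, test_indices) pair per fold. A minimal sketch of that loop on toy data follows, with a guess at the project's selectIndexes helper (its real implementation is not shown above):

# Old-style (scikit-learn < 0.18) stratified cross-validation loop.
from sklearn.cross_validation import StratifiedKFold

def selectIndexes(items, indexes):
    # assumed behaviour of the project's helper: pick items by position
    return [items[i] for i in indexes]

texts = ["doc a", "doc b", "doc c", "doc d"]
labels = ["18-24 male", "25-34 female", "18-24 male", "25-34 female"]

for train, test in StratifiedKFold(labels, 2):
    train_texts = selectIndexes(texts, train)
    test_texts = selectIndexes(texts, test)
    train_labels = selectIndexes(labels, train)
    test_labels = selectIndexes(labels, test)
    # run_classifier(train_texts, test_texts, train_labels, test_labels, ...)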