def get_texts(self):
    """Yield one tokenized, cleaned token stream per XML file in self.input.

    For each file: parse the XML root, pull the corpus language and genre
    from its attributes, then clean() and tokenize() the raw text.
    """
    for filename in self.input:
        # Read the file once under a context manager; the original opened
        # the same file twice and never closed either handle.
        with open(filename) as infile:
            raw = infile.read()
        root = ET.fromstring(raw)
        lang = root.attrib['lang'].lower()
        genre = root.attrib['type']
        tree = ET.ElementTree(root)
        yield tokenize(clean(raw, lang, genre, tree))
def get_texts(self):
    """Generate a tokenized, cleaned text for every XML file in self.input.

    The language ('lang' attribute, lowercased) and genre ('type' attribute)
    are read from each file's root element and forwarded to clean().
    """
    for filename in self.input:
        with open(filename) as handle:
            # Single read, handle closed on exit — the original read the
            # file twice through two leaked open() calls.
            contents = handle.read()
        root = ET.fromstring(contents)
        lang = root.attrib['lang'].lower()
        genre = root.attrib['type']
        tree = ET.ElementTree(root)
        yield tokenize(clean(contents, lang, genre, tree))
def get_texts(self):
    """Yield one tokenized text built by concatenating all cleaned inputs.

    self.input is a pair: self.input[0] is an iterable of indexes and
    self.input[1] maps each index to a file path (demonstrated by the
    subscripting below).
    """
    # NOTE(review): the collapsed original formatting is ambiguous about
    # whether the yield sat inside or after the loop; the accumulator
    # pattern indicates a single yield of the combined text — confirm.
    parts = []
    for index in self.input[0]:
        path = self.input[1][index]
        # Read once and close the handle (original opened the file twice
        # without closing).
        with open(path) as infile:
            raw = infile.read()
        root = ET.fromstring(raw)
        lang = root.attrib['lang'].lower()
        genre = root.attrib['type']
        tree = ET.ElementTree(root)
        parts.append(clean(raw, lang, genre, tree))
    # join() once instead of quadratic `text += string` concatenation.
    yield tokenize("".join(parts))
def get_texts(self):
    """Yield a single tokenized text: the concatenation of every cleaned file.

    Indexes come from self.input[0]; self.input[1][index] is the file path.
    """
    # NOTE(review): original one-line formatting leaves the yield's loop
    # nesting ambiguous; assuming one yield after accumulation — confirm.
    pieces = []
    for idx in self.input[0]:
        filename = self.input[1][idx]
        with open(filename) as fh:
            # One read, closed handle — original leaked two open() calls
            # per file and read each file twice.
            raw = fh.read()
        root = ET.fromstring(raw)
        lang = root.attrib['lang'].lower()
        genre = root.attrib['type']
        tree = ET.ElementTree(root)
        pieces.append(clean(raw, lang, genre, tree))
    # Linear-time join replaces the original quadratic `+=` loop.
    yield tokenize("".join(pieces))
def run(args): #input_path = "/Users/jamarq_laptop/PAN/Software/pan-2/Data/blogs/es/" #model_path = "./Models/" #output_path = "./Outputs/" input_path = args.input output_path = args.output model_path = args.model if not input_path.endswith("/"): input_path += "/" if not output_path.endswith("/"): output_path += "/" if not model_path.endswith("/"): model_path += "/" files = [f for f in os.listdir(input_path) if f.endswith('.xml')] tree = ET.parse(input_path + files[0]) root = tree.getroot() type = root.attrib["type"] + "_" + root.attrib["lang"] clf = joblib.load(model_path + type + '.pkl') topic_model = joblib.load(model_path + type + '_topic_model.pkl') scaler = joblib.load(model_path + type + "_scaler.pkl") for file in files: features = [] tree = ET.parse(input_path + file) root = tree.getroot() aut_id = file.split("_")[0] if "." in aut_id: aut_id = aut_id.split(".")[0] lang = type.split("_")[1] doccount = tree.find("documents").attrib["count"] features.extend(topic_model.get_sim_unseen(input_path + file)) features.append(int(doccount)) xmlstr = clean(ET.tostring(root), root.attrib['lang'].lower(), root.attrib['type'], tree) xmlstr = filter(lambda x: x in string.printable, xmlstr) tokens = word_tokenize(xmlstr) nostop_tokens = removeStopWords(tokens, type.split("_")[1]) features.append(extractRepeatation(nostop_tokens, type.split("_")[1])) cap_word, cap_let = extractCapital(nostop_tokens) features.append(cap_word) features.append(cap_let) features.append(len(tokens)) LIWCDic = readDictionary(getLIWCOnLang(lang)) features.extend(extractLIWCFeatures(tokens, LIWCDic)) features.extend(extractReadabilityArray(xmlstr, nostop_tokens)) these_features.append(extractEmoticons(xmlstr, nostop_tokens)) these_features.extend(extractHTMLTags(xmlstr, nostop_tokens)) label = clf.predict(scaler.transform(features)) author = ET.Element('author') author.set("id", aut_id) author.set("type", type.split("_")[0]) author.set("lang", type.split("_")[1]) author.set("age_group", 
label[0].split(" ")[0]) author.set("gender", label[0].split(" ")[1]) ET.ElementTree(author).write(output_path + aut_id + ".xml") print "All done!" print '\a'
def run(args): #print "test, fix these before deployment" #input_path = "/Users/jamarq_laptop/PAN/Software/pan-2/Data/blogs/es/" #out_path = "./Models/" input_path = args.input output_path = args.output if not input_path.endswith("/"): input_path += "/" if not output_path.endswith("/"): output_path += "/" truth_in_xml = True truth_dic = dict() if os.path.isfile(input_path+"truth.txt"): print "using truth.txt file" truth_in_xml = False with open(input_path+"truth.txt","rb") as truth_in: for line in truth_in: truth_data = line.split(":::") truth_dic[truth_data[0]] = (truth_data[2],truth_data[1]) print "input path: " + input_path topic_model = TopicModels(input_path) #print example.get_divergences_seen(1) files = [f for f in os.listdir(input_path) if f.endswith('.xml')] print str(len(files)) + " files in corpus" tree = ET.parse(input_path + files[0]) root = tree.getroot() type = root.attrib["type"] + "_" + root.attrib["lang"] features = [] labels = [] ages = [] genders = [] ids = [] doc_index = 0 for file in files: these_features = [] these_features.extend(topic_model.get_sim_seen(doc_index)) tree = ET.parse(input_path + file) root = tree.getroot() these_features.append(int(tree.find("documents").attrib["count"])) xmlstr = clean(ET.tostring(root),root.attrib['lang'].lower(),root.attrib['type'],tree) xmlstr = filter(lambda x: x in string.printable, xmlstr) tokens = word_tokenize(xmlstr) nostop_tokens = removeStopWords(tokens, type.split("_")[1]) these_features.append(extractRepeatation(nostop_tokens, type.split("_")[1])) cap_word, cap_let = extractCapital(nostop_tokens) these_features.append(cap_word) these_features.append(cap_let) these_features.append(len(tokens)) LIWCDic = readDictionary(getLIWCOnLang(type.split("_")[1])) these_features.extend(extractLIWCFeatures(tokens, LIWCDic)) these_features.extend(extractReadabilityArray(xmlstr,nostop_tokens)) these_features.append(extractEmoticons(xmlstr, nostop_tokens)) these_features.extend(extractHTMLTags(xmlstr, 
nostop_tokens)) features.append(these_features) #Label each row id = file.split(".")[0].split("_")[0] if truth_in_xml: labels.append(root.attrib["age_group"]+" "+root.attrib["gender"].lower()) ages.append(root.attrib["age_group"]) genders.append(root.attrib["gender"].lower()) else: labels.apend(truth_dic[id][1] + " " + truth_dic[id][0].lower()) ages.append(truth_dic[id][1]) genders.append(truth_dic[id][0]) ids.append(id) features = numpy.array(features) scaler = preprocessing.MinMaxScaler() features = scaler.fit_transform(features) ''' col_max = [ max(x) for x in zip(*features) ] col_min = [ min(x) for x in zip(*features) ] for feature in features: for obs in range(0,len(feature)): val = 0 try: val = (feature[obs] - col_min[obs])/(col_max[obs]-col_min[obs]) except ZeroDivisionError: val = 0 feature[obs] = float(val) ''' writeCSV(features.tolist(), labels, ages, genders, ids, type+'_features.csv') #Train liblinear implementation of SVM clf = svm.LinearSVC() clf.fit(features, labels) joblib.dump(scaler, output_path+type+'_scaler.pkl') joblib.dump(topic_model, output_path+type+'_topic_model.pkl') joblib.dump(clf, output_path+type+'.pkl') print "All done!" print '\a'
genders.append(truth_data[1].lower()) truth_dic[truth_data[0]] = (truth_data[2]+" "+truth_data[1].lower(),truth_data[2],truth_data[1]) else: for file in files: tree = ET.parse(file) root = tree.getroot() labels.append(root.attrib["age_group"]+" "+root.attrib["gender"].lower()) ages.append(root.attrib["age_group"]) genders.append(root.attrib["gender"].lower()) tree = ET.parse(files[0]) #Clean all texts for file in files: with open(file) as infile: texts.append(clean(infile.read(),tree.getroot())) #Make train/test sets comb_skf = StratifiedKFold(labels, folds) age_skf = StratifiedKFold(ages, folds) gender_skf = StratifiedKFold(genders, folds) age_accuracy = [] gender_accuracy = [] combined_accuracy = [] #Train and fit models based on age, gender, and combined labels print "Evaluating comb models" for train, test in comb_skf:
def cleanAndRun(files, truth_files, lang): print "running on " + lang + " files" labels = [] ages = [] genders = [] texts = [] truth_dic = dict() for file in truth_files: with open(file) as intruth: for line in intruth: truth_data = line.split(":::") truth_dic[truth_data[0]] = (truth_data[2], truth_data[1].lower()) for file in files: user_id = file.split("/")[-1].split(".")[0] if user_id not in truth_dic: raise Exception("no truth data found for user" + user_id) age = truth_dic[user_id][0] gender = truth_dic[user_id][1] labels.append(age + " " + gender) ages.append(age) genders.append(gender) print "label count: " + str(len(labels)) tree = ET.parse(files[0]) #Clean all texts for file in files: with open(file) as infile: texts.append(clean(infile.read(), tree.getroot())) print "text count: " + str(len(texts)) #Make train/test sets comb_skf = StratifiedKFold(labels, folds) age_skf = StratifiedKFold(ages, folds) gender_skf = StratifiedKFold(genders, folds) age_accuracy = [] gender_accuracy = [] combined_accuracy = [] #Train and fit models based on age, gender, and combined labels print "Evaluating comb models" for train, test in comb_skf: train_files = selectIndexes(texts, train) test_files = selectIndexes(texts, test) train_labels = selectIndexes(labels, train) test_labels = selectIndexes(labels, test) combined_accuracy.append( run_classifier(train_files, test_files, train_labels, test_labels, gram_type, feature_count)) #print "All combined accuracies: " + str(combined_accuracy) print "Combined accuracy after " + str( folds) + " fold cross validation using " + gram_type + ": " + str( sum(combined_accuracy) / len(combined_accuracy)) print "Evaluating age models" for train, test in age_skf: train_files = selectIndexes(texts, train) test_files = selectIndexes(texts, test) train_ages = selectIndexes(ages, train) test_ages = selectIndexes(ages, test) age_accuracy.append( run_classifier(train_files, test_files, train_ages, test_ages, gram_type, feature_count)) #print "All 
age accuracies: " + str(age_accuracy) print "Age accuracy after " + str( folds) + " fold cross validation using " + gram_type + ": " + str( sum(age_accuracy) / len(age_accuracy)) print "Evaluating gender models" for train, test in gender_skf: train_files = selectIndexes(texts, train) test_files = selectIndexes(texts, test) train_genders = selectIndexes(genders, train) test_genders = selectIndexes(genders, test) gender_accuracy.append( run_classifier(train_files, test_files, train_genders, test_genders, gram_type, feature_count)) #print "All gender accuracies: " + str(gender_accuracy) print "Gender accuracy after " + str( folds) + " fold cross validation using " + gram_type + ": " + str( sum(gender_accuracy) / len(gender_accuracy))