def create_dataset_2_sample_size(input_path, output_path):
    """Build the "sample size" variant of dataset 2 and dump it with joblib.

    Loads essay CSVs from input_path, extracts label/score columns, cleans,
    math-simplifies (item 53299 only), spellchecks, vocabulary-reduces and
    stems the answer text, then merges bag-of-words and surface features.
    Side effect: writes [dataset, dataset] to output_path via joblib.dump.
    """
    essays = Essays(input_path)

    # Per-item label and reader-score columns, keyed by essay item.
    labels = essays.apply_cell_function("label", identity)
    score_read_1 = essays.apply_cell_function("read_1_score", identity)
    score_read_2 = essays.apply_cell_function("read_2_score", identity)
    score_final = essays.apply_cell_function("final_score", identity)

    # prepares text
    print("Preparing text...")
    raw_texts = essays.apply_cell_function("data_answer", identity)
    raw_texts = func_over_dict(raw_texts, apply_map_func(string.upper),
                               parallel=True)

    texts = essays.apply_cell_function("data_answer", identity)
    print("Cleaning text...")
    texts = clean_text(texts)

    print("Simplifying math expressions...")
    # NOTE(review): only items whose key starts with "53299" are treated as
    # math items here -- confirm this covers all math prompts in this dataset.
    for math_key in [k for k in texts.keys() if k.startswith("53299")]:
        texts[math_key] = texts[math_key].map(simplify_math)

    print("Spellchecking...")
    texts = func_over_dict(texts, apply_map_func(spellcheck), parallel=False)
    print("Reducing vocabulary...")
    texts = func_over_dict(texts, reduce_vocabulary_func, parallel=True)
    print("Stemming...")
    texts = func_over_dict(
        texts,
        apply_map_func(lambda t: " ".join([stemmer(w) for w in t.split()])),
        parallel=True)

    # 1-gram bag of words over the fully processed text.
    bow_args = {'min_df': 2, 'ngram_range': (1, 1), 'stop_words': 'english',
                'tokenizer': lambda t: t.split()}
    bow_1_gram = func_over_dict(texts, lambda t: bag_of_words(t, **bow_args),
                                parallel=True)

    # Wrap each label/score series as a one-column DataFrame so it can merge.
    meta_label = func_over_dict(
        labels, lambda s: pd.DataFrame({'META_LABEL': s}), parallel=True)
    meta_score_1 = func_over_dict(
        score_read_1, lambda s: pd.DataFrame({'META_SCORE_1': s}), parallel=True)
    meta_score_2 = func_over_dict(
        score_read_2, lambda s: pd.DataFrame({'META_SCORE_2': s}), parallel=True)
    meta_score_final = func_over_dict(
        score_final, lambda s: pd.DataFrame({'META_SCORE_FINAL': s}),
        parallel=True)

    # Surface features are computed on the raw, upper-cased text.
    text_stats = func_over_dict(raw_texts, text_statistics, parallel=True)
    quotations_num = func_over_dict(
        func_over_dict(raw_texts,
                       apply_map_func(lambda t: len(re.findall('"(.*?)"', t)))),
        lambda s: pd.DataFrame({'QUOTATIONS_NUM': s}), parallel=True)
    yes_position = func_over_dict(
        func_over_dict(raw_texts, apply_map_func(lambda t: t.find("YES"))),
        lambda s: pd.DataFrame({'YES_POSITION': s}), parallel=True)
    no_position = func_over_dict(
        func_over_dict(raw_texts, apply_map_func(lambda t: t.find("NO"))),
        lambda s: pd.DataFrame({'NO_POSITION': s}), parallel=True)

    dataset = merge_dataframes([
        meta_label,
        meta_score_1,
        meta_score_2,
        meta_score_final,
        bow_1_gram,
        text_stats,
        quotations_num,
        yes_position,
        no_position,
    ])
    # Downstream consumers expect a two-element list holding the frame twice.
    dataset = [dataset, dataset]
    joblib.dump(dataset, output_path)
def create_dataset_2_SE():
    """Build dataset 2 for the SE items and dump it with joblib.

    Loads essay CSVs from data_work/items_data_se/, extracts label/score
    columns, cleans, spellchecks, vocabulary-reduces and stems the answer
    text, then merges bag-of-words and surface features per item.
    Side effect: writes [dataset, dataset] to
    data_work/datasets/dataset_2_SE via joblib.dump.
    """
    essays = Essays("data_work/items_data_se/*.csv")
    LABEL = essays.apply_cell_function("label", identity)
    READ_1_SCORE = essays.apply_cell_function("read_1_score", identity)
    READ_2_SCORE = essays.apply_cell_function("read_2_score", identity)
    FINAL_SCORE = essays.apply_cell_function("final_score", identity)

    # prepares text
    print("Preparing text...")
    RAW_TEXTS = essays.apply_cell_function("data_answer", identity)
    RAW_TEXTS = func_over_dict(RAW_TEXTS, apply_map_func(string.upper))
    TEXTS = essays.apply_cell_function("data_answer", identity)
    print("Cleaning text...")
    TEXTS = clean_text(TEXTS)

    print("Spellchecking...")
    # FIX: the exclude list contained "EQUATIONINCORRECT" twice; the duplicate
    # was redundant.  NOTE(review): the second entry may have been intended as
    # "EQUATIONCORRECT" -- confirm against the placeholder tokens emitted by
    # the math-simplification step before adding it.
    TEXTS = func_over_dict(
        TEXTS,
        apply_map_func(lambda x: spellcheck(x, exclude=["EQUATIONINCORRECT"])))

    print("Reducing vocabulary...")
    TEXTS = reduce_vocabulary_dict(TEXTS)
    print("Stemming...")
    TEXTS = func_over_dict(
        TEXTS,
        apply_map_func(lambda x: " ".join([stemmer(w) for w in x.split()])))

    # 1-gram bag of words over the fully processed text.
    bow_args = {'min_df': 2, 'ngram_range': (1, 1), 'stop_words': 'english',
                'tokenizer': lambda x: x.split()}
    BOW_1_GRAM = func_over_dict(TEXTS, lambda x: bag_of_words(x, **bow_args))

    # Wrap each label/score series as a one-column DataFrame for merging.
    META_LABEL = func_over_dict(
        LABEL, lambda x: pd.DataFrame({'META_LABEL': x}))
    META_SCORE_1 = func_over_dict(
        READ_1_SCORE, lambda x: pd.DataFrame({'META_SCORE_1': x}))
    META_SCORE_2 = func_over_dict(
        READ_2_SCORE, lambda x: pd.DataFrame({'META_SCORE_2': x}))
    META_SCORE_FINAL = func_over_dict(
        FINAL_SCORE, lambda x: pd.DataFrame({'META_SCORE_FINAL': x}))

    # Surface features are computed on the raw, upper-cased text.
    TEXT_STATISTICS = func_over_dict(RAW_TEXTS, text_statistics)
    QUOTATIONS_NUM = func_over_dict(
        func_over_dict(RAW_TEXTS,
                       apply_map_func(lambda x: len(re.findall('"(.*?)"', x)))),
        lambda x: pd.DataFrame({'QUOTATIONS_NUM': x}))
    YES_POSITION = func_over_dict(
        func_over_dict(RAW_TEXTS, apply_map_func(lambda x: x.find("YES"))),
        lambda x: pd.DataFrame({'YES_POSITION': x}))
    NO_POSITION = func_over_dict(
        func_over_dict(RAW_TEXTS, apply_map_func(lambda x: x.find("NO"))),
        lambda x: pd.DataFrame({'NO_POSITION': x}))

    dataset = merge_dataframes([
        META_LABEL,
        META_SCORE_1,
        META_SCORE_2,
        META_SCORE_FINAL,
        BOW_1_GRAM,
        TEXT_STATISTICS,
        QUOTATIONS_NUM,
        YES_POSITION,
        NO_POSITION,
    ])
    # Downstream consumers expect a two-element list holding the frame twice.
    dataset = [dataset, dataset]
    joblib.dump(dataset, "data_work/datasets/dataset_2_SE")
def create_dataset_2_gaming():
    """Build dataset 2 for the gaming items and dump it with joblib.

    Loads essay CSVs from data_work/items_data_gaming/, extracts label/score
    columns, cleans, spellchecks, vocabulary-reduces and stems the answer
    text, then merges bag-of-words and surface features per item.
    Side effect: writes [dataset, dataset] to
    data_work/datasets/dataset_2_gaming via joblib.dump.
    """
    essays = Essays("data_work/items_data_gaming/*.csv")
    LABEL = essays.apply_cell_function("label", identity)
    READ_1_SCORE = essays.apply_cell_function("read_1_score", identity)
    READ_2_SCORE = essays.apply_cell_function("read_2_score", identity)
    FINAL_SCORE = essays.apply_cell_function("final_score", identity)

    # prepares text
    print("Preparing text...")
    RAW_TEXTS = essays.apply_cell_function("data_answer", identity)
    RAW_TEXTS = func_over_dict(RAW_TEXTS, apply_map_func(string.upper))
    TEXTS = essays.apply_cell_function("data_answer", identity)
    print("Cleaning text...")
    TEXTS = clean_text(TEXTS)

    print("Spellchecking...")
    # FIX: the exclude list contained "EQUATIONINCORRECT" twice; the duplicate
    # was redundant.  NOTE(review): the second entry may have been intended as
    # "EQUATIONCORRECT" -- confirm against the placeholder tokens emitted by
    # the math-simplification step before adding it.
    TEXTS = func_over_dict(
        TEXTS,
        apply_map_func(lambda x: spellcheck(x, exclude=["EQUATIONINCORRECT"])))

    print("Reducing vocabulary...")
    TEXTS = reduce_vocabulary_dict(TEXTS)
    print("Stemming...")
    TEXTS = func_over_dict(
        TEXTS,
        apply_map_func(lambda x: " ".join([stemmer(w) for w in x.split()])))

    # 1-gram bag of words over the fully processed text.
    bow_args = {'min_df': 2, 'ngram_range': (1, 1), 'stop_words': 'english',
                'tokenizer': lambda x: x.split()}
    BOW_1_GRAM = func_over_dict(TEXTS, lambda x: bag_of_words(x, **bow_args))

    # Wrap each label/score series as a one-column DataFrame for merging.
    META_LABEL = func_over_dict(
        LABEL, lambda x: pd.DataFrame({'META_LABEL': x}))
    META_SCORE_1 = func_over_dict(
        READ_1_SCORE, lambda x: pd.DataFrame({'META_SCORE_1': x}))
    META_SCORE_2 = func_over_dict(
        READ_2_SCORE, lambda x: pd.DataFrame({'META_SCORE_2': x}))
    META_SCORE_FINAL = func_over_dict(
        FINAL_SCORE, lambda x: pd.DataFrame({'META_SCORE_FINAL': x}))

    # Surface features are computed on the raw, upper-cased text.
    TEXT_STATISTICS = func_over_dict(RAW_TEXTS, text_statistics)
    QUOTATIONS_NUM = func_over_dict(
        func_over_dict(RAW_TEXTS,
                       apply_map_func(lambda x: len(re.findall('"(.*?)"', x)))),
        lambda x: pd.DataFrame({'QUOTATIONS_NUM': x}))
    YES_POSITION = func_over_dict(
        func_over_dict(RAW_TEXTS, apply_map_func(lambda x: x.find("YES"))),
        lambda x: pd.DataFrame({'YES_POSITION': x}))
    NO_POSITION = func_over_dict(
        func_over_dict(RAW_TEXTS, apply_map_func(lambda x: x.find("NO"))),
        lambda x: pd.DataFrame({'NO_POSITION': x}))

    dataset = merge_dataframes([
        META_LABEL,
        META_SCORE_1,
        META_SCORE_2,
        META_SCORE_FINAL,
        BOW_1_GRAM,
        TEXT_STATISTICS,
        QUOTATIONS_NUM,
        YES_POSITION,
        NO_POSITION,
    ])
    # Downstream consumers expect a two-element list holding the frame twice.
    dataset = [dataset, dataset]
    joblib.dump(dataset, "data_work/datasets/dataset_2_gaming")
def create_dataset_2_sample_size(input_path, output_path):
    """Build the "sample size" variant of dataset 2 and dump it with joblib.

    NOTE(review): this redefines create_dataset_2_sample_size from earlier in
    this module; being the later definition, it is the effective binding at
    import time -- consider removing one copy.

    Side effect: writes [dataset, dataset] to output_path via joblib.dump.
    """
    def frame_wrapper(column):
        # Helper: wrap a per-essay series into a one-column DataFrame.
        return lambda series: pd.DataFrame({column: series})

    essays = Essays(input_path)
    label_col = essays.apply_cell_function("label", identity)
    read1_col = essays.apply_cell_function("read_1_score", identity)
    read2_col = essays.apply_cell_function("read_2_score", identity)
    final_col = essays.apply_cell_function("final_score", identity)

    # prepares text
    print("Preparing text...")
    raw = essays.apply_cell_function("data_answer", identity)
    raw = func_over_dict(raw, apply_map_func(string.upper), parallel=True)
    cleaned = essays.apply_cell_function("data_answer", identity)
    print("Cleaning text...")
    cleaned = clean_text(cleaned)

    print("Simplifying math expressions...")
    # Only item keys starting with "53299" get the math simplification pass.
    for math_key in [k for k in cleaned.keys() if k.startswith("53299")]:
        cleaned[math_key] = cleaned[math_key].map(simplify_math)

    print("Spellchecking...")
    cleaned = func_over_dict(cleaned, apply_map_func(spellcheck),
                             parallel=False)
    print("Reducing vocabulary...")
    cleaned = func_over_dict(cleaned, reduce_vocabulary_func, parallel=True)
    print("Stemming...")
    stem_one = lambda x: " ".join([stemmer(w) for w in x.split()])
    cleaned = func_over_dict(cleaned, apply_map_func(stem_one), parallel=True)

    # 1-gram bag of words over the fully processed text.
    bow_args = {'min_df': 2, 'ngram_range': (1, 1), 'stop_words': 'english',
                'tokenizer': lambda x: x.split()}
    bow = func_over_dict(cleaned, lambda x: bag_of_words(x, **bow_args),
                         parallel=True)

    # Labels/scores wrapped as single-column frames so they can be merged.
    meta_label = func_over_dict(label_col, frame_wrapper('META_LABEL'),
                                parallel=True)
    meta_score_1 = func_over_dict(read1_col, frame_wrapper('META_SCORE_1'),
                                  parallel=True)
    meta_score_2 = func_over_dict(read2_col, frame_wrapper('META_SCORE_2'),
                                  parallel=True)
    meta_score_final = func_over_dict(final_col,
                                      frame_wrapper('META_SCORE_FINAL'),
                                      parallel=True)

    # Surface features are computed on the raw, upper-cased text.
    stats = func_over_dict(raw, text_statistics, parallel=True)
    quotes = func_over_dict(
        func_over_dict(raw,
                       apply_map_func(lambda x: len(re.findall('"(.*?)"', x)))),
        frame_wrapper('QUOTATIONS_NUM'), parallel=True)
    yes_pos = func_over_dict(
        func_over_dict(raw, apply_map_func(lambda x: x.find("YES"))),
        frame_wrapper('YES_POSITION'), parallel=True)
    no_pos = func_over_dict(
        func_over_dict(raw, apply_map_func(lambda x: x.find("NO"))),
        frame_wrapper('NO_POSITION'), parallel=True)

    dataset = merge_dataframes([
        meta_label,
        meta_score_1,
        meta_score_2,
        meta_score_final,
        bow,
        stats,
        quotes,
        yes_pos,
        no_pos,
    ])
    # Downstream consumers expect a two-element list holding the frame twice.
    dataset = [dataset, dataset]
    joblib.dump(dataset, output_path)
,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw", fun=lambda essay: unique_words_norm(essay.texts["raw"])/100.0) ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_10_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],10)/100.0) ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_18_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],18)/100.0) ,FunctionalTextEssayFeature(feature_name="n_sentences_longer_than_25_raw", fun=lambda essay: n_sentences_longer_than(essay.texts["raw"],25)/100.0) ,EssaySkipgram(name="LETTER",source="clean",base=lambda text: text, nskip=0, ngram=3) ,EssaySkipgram(name="WORD",source="clean",base=lambda text: text.split(), nskip=0, ngram=1) ] } pipeline_2 = { "name":"DATASET_2", "steps":[ EssayTextConversion(source="raw",dest="clean",fun=safe_clean_text) ,EssayTextConversion(source="clean",dest="clean",fun=text_to_math) ,EssayTextConversion(source="clean",dest="stem",fun=lambda text: " ".join([stemmer(t) for t in text.split()])) ,EssayTextConversion(source="clean",dest="pos",fun=lambda text: " ".join([k[1] for k in TextBlob(text).tags])) ,EssayFeature(fun=lambda essay: get_math_expressions_features(essay.texts["clean"])) ,FunctionalTextEssayFeature(feature_name="n_words_raw", fun=lambda essay: n_words(essay.texts["raw"])/1000.0) ,FunctionalTextEssayFeature(feature_name="text_length_raw", fun=lambda essay: (text_length(essay.texts["raw"]))/1000.0) ,FunctionalTextEssayFeature(feature_name="text_length_2nd_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.50)/1000.0) ,FunctionalTextEssayFeature(feature_name="text_length_4th_root_raw", fun=lambda essay: (text_length(essay.texts["raw"])**0.25)/1000.0) ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_4_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],4)/100.0) ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_6_raw", fun=lambda essay: 
n_words_longer_than(essay.texts["raw"],6)/100.0) ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_8_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],8)/100.0) ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_10_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],10)/100.0) ,FunctionalTextEssayFeature(feature_name="n_words_longer_than_12_raw", fun=lambda essay: n_words_longer_than(essay.texts["raw"],12)/100.0) ,FunctionalTextEssayFeature(feature_name="words_length_mean_raw", fun=lambda essay: words_length_mean(essay.texts["raw"])/100.0) ,FunctionalTextEssayFeature(feature_name="words_length_variance_raw", fun=lambda essay: words_length_variance(essay.texts["raw"])/100.0) ,FunctionalTextEssayFeature(feature_name="unique_words_norm_raw", fun=lambda essay: unique_words_norm(essay.texts["raw"])/100.0)