import numpy as np
from xml.etree import ElementTree


def feature_three(text):
    # Polysyllabic words (more than one syllable) per sentence.
    words = get_words(text)
    poly_syllables_words = 0
    for word in words:
        if count_syllables(word) > 1:
            poly_syllables_words += 1
    return poly_syllables_words / total_sentences(text)
def feature_two(text):
    # Percentage of words with more than one syllable.
    words = get_words(text)
    poly_syllables_words = 0
    for word in words:
        if count_syllables(word) > 1:
            poly_syllables_words += 1
    return poly_syllables_words / len(words) * 100
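# The feature functions in this file rely on get_words, count_syllables, and
# total_sentences, which are not defined in this section. Minimal sketches of
# plausible implementations (regex tokenization, a vowel-group syllable heuristic,
# NLTK's Punkt sentence splitter); the project's actual helpers may differ:
import re

import nltk  # sent_tokenize requires the 'punkt' tokenizer data


def get_words(text):
    # Lowercase word tokens.
    return re.findall(r"\w+", text.lower())


def count_syllables(word):
    # Rough heuristic: number of contiguous vowel groups, at least one.
    return max(1, len(re.findall(r"[aeiouy]+", word.lower())))


def total_sentences(text):
    # Sentence count via NLTK's Punkt tokenizer.
    return len(nltk.sent_tokenize(text))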
def feature_seven(text):
    # Flesch-Kincaid grade level:
    # 0.39 * words/sentence + 11.8 * syllables/word - 15.59.
    words = get_words(text)
    syllables_sum = 0
    for word in words:
        syllables_sum += count_syllables(word)
    return (0.39 * len(words) / total_sentences(text)
            + 11.8 * syllables_sum / len(words) - 15.59)
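# Tiny usage sketch for the formula above, assuming the helper sketches defined
# earlier in this file (the exact value depends on the syllable heuristic):
def _demo_flesch_kincaid():
    sample = "The quick brown fox jumps over the lazy dog. It runs fast."
    return feature_seven(sample)  # a low grade level for this simple sample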
def extract_features(data, pos):
    # Ratios of part-of-speech matches to word, unique-word, and sentence totals.
    extr_words = extract_words(data, pos)
    words = extr_words.words
    unique_words = extr_words.unique_words
    total_w = len(get_words(data))
    total_unique_w = len(np.unique(get_words(data)))
    total_s = total_sentences(data)
    feature1 = words / total_w * 100
    feature2 = unique_words / total_w * 100
    feature3 = unique_words / total_unique_w
    feature4 = words / total_s
    feature5 = unique_words / total_s
    return [feature1, feature2, feature3, feature4, feature5]
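# extract_words is assumed to return an object whose .words and .unique_words
# attributes are counts of part-of-speech matches (the ratios above divide them
# by word totals). A hypothetical sketch using NLTK POS tags; the namedtuple
# shape and tag-prefix matching are assumptions:
from collections import namedtuple

WordCounts = namedtuple('WordCounts', ['words', 'unique_words'])


def extract_words(text, pos):
    tagged = nltk.pos_tag(get_words(text))  # requires the POS tagger data
    matches = [w for w, tag in tagged if tag.startswith(pos)]
    return WordCounts(words=len(matches), unique_words=len(set(matches)))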
def extract_features(data, pos_type):
    # POS ratios for an XML document: totals come from the plain text stored in
    # the first child element, POS counts from the document as passed in.
    root = ElementTree.fromstring(data)
    pure_text = root[0].text
    extr_words = extract_words(data, pos_type)
    words = extr_words.words
    unique_words = extr_words.unique_words
    total_w = len(get_words(pure_text))
    total_unique_w = len(np.unique(get_words(pure_text)))
    total_s = total_sentences(pure_text)
    feature1 = words / total_w * 100
    feature2 = unique_words / total_w * 100
    feature3 = unique_words / total_unique_w
    feature4 = words / total_s
    feature5 = unique_words / total_s
    return [feature1, feature2, feature3, feature4, feature5]
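# The XML variant above only requires that the document's first child element
# hold the raw text (root[0].text). A minimal example of the assumed shape; the
# tag names are illustrative:
sample_xml = '<doc><text>The quick brown fox jumps over the lazy dog.</text></doc>'
# extract_features(sample_xml, 'NN') would compute noun ratios for the embedded text.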
def extract_features(data):
    # Named entities per 100 words and per 100 sentences, counted through an
    # external entity-extraction service.
    root = ElementTree.fromstring(data)
    pure_text = root[0].text
    ne = extract_entities_api(data)
    tw = len(get_words(pure_text))
    ts = total_sentences(pure_text)
    feature1 = ne / tw * 100
    feature2 = ne / ts * 100
    return [feature1, feature2]
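# extract_entities_api is assumed to return the number of named entities reported
# by an external service. A local stand-in built on NLTK's chunker, not the
# project's actual API (requires the 'maxent_ne_chunker' and 'words' data):
def extract_entities_api(data):
    root = ElementTree.fromstring(data)
    text = root[0].text
    tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    # Chunked subtrees carry a label (PERSON, GPE, ...); plain leaves do not.
    return sum(1 for node in tree if hasattr(node, 'label'))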
def feature_five(text):
    # Dale-Chall readability: 0.0496 * words/sentence plus 0.1579 * percentage
    # of words outside the easy-word list, plus a 3.6365 adjustment.
    path_easy_words = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/DaleChallEasyWordList.txt"
    words = get_words(text)
    difficult_words_sum = 0
    with open(path_easy_words, 'r') as f:
        # Strip newlines so membership tests match; raw readlines() would keep
        # the trailing '\n' and mark every word as difficult.
        easy_words = set(line.strip() for line in f)
    for word in words:
        if word not in easy_words:
            difficult_words_sum += 1
    return (0.0496 * len(words) / total_sentences(text)
            + 0.1579 * difficult_words_sum / len(words) * 100 + 3.6365)
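# feature_five deviates from the published Dale-Chall formula, which adds the
# 3.6365 adjustment only when more than 5% of words are off the easy-word list.
# A hypothetical corrected variant, assuming the same helpers and a preloaded
# easy-word set:
def dale_chall_adjusted(text, easy_words):
    words = get_words(text)
    difficult = sum(1 for w in words if w not in easy_words)
    score = (0.1579 * difficult / len(words) * 100
             + 0.0496 * len(words) / total_sentences(text))
    if difficult / len(words) > 0.05:
        score += 3.6365
    return score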
def extract_features(data):
    # Named entities per 100 words and per 100 sentences, counted locally.
    extr_entities = extract_entities(data)
    ne = extr_entities.ne
    tw = len(get_words(data))
    ts = total_sentences(data)
    feature1 = ne / tw * 100
    feature2 = ne / ts * 100
    print(str(feature1) + " " + str(feature2))
    return [feature1, feature2]
def extract_features(data):
    # Share of words carrying a known prefix or suffix. Only the suffix ratio is
    # returned; the prefix ratio is computed but currently unused.
    prefixes = [p.replace('\r\n', '') for p in load_data('pref.txt')]
    suffixes = [s.replace('\r\n', '') for s in load_data('suff.txt')]
    prefix_num = 0
    suffix_num = 0
    words = get_words(data)
    for word in words:
        word = word.lower()
        if any(word.startswith(prefix) for prefix in prefixes):
            prefix_num += 1
        # A suffix counts when its first occurrence starts after position 2.
        if any(word.find(suffix) > 2 for suffix in suffixes):
            suffix_num += 1
    print(suffix_num / len(words))
    return [suffix_num / len(words)]
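# load_data is assumed to return the raw lines of an affix list shipped with the
# project; the '\r\n' stripping above suggests one affix per line with Windows
# line endings. A minimal sketch under that assumption:
def load_data(filename):
    with open(filename) as f:
        return f.readlines()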
def feature_eight(text):
    # Total word count.
    return len(get_words(text))
def extract_features(data, clf):
    # Share of words a pre-trained classifier labels as borrowed, based on a
    # one-hot encoding of each word's longest matching prefix and suffix.
    prefixes = [p.replace('\r\n', '') for p in load_data('pref.txt')]
    suffixes = [s.replace('\r\n', '') for s in load_data('suff.txt')]
    words = get_words(data)
    borrowed_num = 0
    original_num = 0
    for word in words:
        word = word.lower()
        prefix_cand = [p for p in prefixes if word.startswith(p)]
        suffix_cand = [s for s in suffixes if word.find(s) > 2]
        prefix = max(prefix_cand, key=len) if prefix_cand else 'none'
        suffix = max(suffix_cand, key=len) if suffix_cand else 'none'
        # One-hot encode the chosen affixes over the full prefix and suffix lists.
        arr = [1 if p == prefix else 0 for p in prefixes]
        arr += [1 if s == suffix else 0 for s in suffixes]
        if suffix != 'none' or prefix != 'none':
            label = clf.predict([arr])[0]  # predict expects a 2-D array
            if label == 'borrowed':
                borrowed_num += 1
            elif label == 'original':
                original_num += 1
    return [borrowed_num / len(words) * 100]
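# clf is expected to be a scikit-learn-style classifier trained elsewhere on
# one-hot affix vectors labeled 'borrowed' or 'original'. A toy setup sketch;
# the estimator choice and 4-dimensional demo vectors are assumptions, and real
# vectors have len(prefixes) + len(suffixes) dimensions:
from sklearn.tree import DecisionTreeClassifier

_demo_X = [[1, 0, 0, 1], [0, 0, 0, 0]]  # toy affix one-hot vectors
_demo_y = ['borrowed', 'original']
demo_clf = DecisionTreeClassifier().fit(_demo_X, _demo_y)
# extract_features(document_text, demo_clf) would then score a real document,
# given matching pref.txt / suff.txt files.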
def feature_six(text):
    # Average sentence length in words.
    return len(get_words(text)) / total_sentences(text)
def feature_four(text):
    # Average word length in characters.
    words = get_words(text)
    characters_sum = 0
    for word in words:
        characters_sum += len(word)
    return characters_sum / len(words)
def feature_one(text):
    # Average syllables per word.
    words = get_words(text)
    syllables_sum = 0
    for word in words:
        syllables_sum += count_syllables(word)
    return syllables_sum / len(words)
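# A hypothetical aggregator bundling the surface features above into one vector
# (assumes the helper sketches earlier in this file and the easy-word-list path
# used by feature_five):
def surface_feature_vector(text):
    return [feature_one(text), feature_two(text), feature_three(text),
            feature_four(text), feature_five(text), feature_six(text),
            feature_seven(text), feature_eight(text)]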