def proximity_search(k, terms):
    # Find documents where the two (singularized) terms occur within k words
    # of each other, using the positional postings lists in `index`.
    k += 1
    result = []
    proximities = []
    t1 = singularize(terms[0])
    t2 = singularize(terms[1])
    l1 = index[t1]
    l2 = index[t2]
    ptr1 = 0
    ptr2 = 0
    while ptr1 < len(l1) and ptr2 < len(l2):
        doc1 = l1[ptr1]
        doc2 = l2[ptr2]
        if doc1["doc_id"] == doc2["doc_id"]:
            # Both terms appear in this document: collect position pairs
            # that are at most k apart.
            l = []
            pos_ptr1 = 0
            pos_ptr2 = 0
            pos_pairs = []
            while pos_ptr1 < len(doc1["positions"]):
                while pos_ptr2 < len(doc2["positions"]):
                    if abs(doc1["positions"][pos_ptr1] - doc2["positions"][pos_ptr2]) <= k:
                        l.append(doc2["positions"][pos_ptr2])
                    elif doc2["positions"][pos_ptr2] > doc1["positions"][pos_ptr1]:
                        break
                    pos_ptr2 += 1
                # Drop candidate positions that are now too far behind.
                while l and abs(l[0] - doc1["positions"][pos_ptr1]) > k:
                    l.remove(l[0])
                for position in l:
                    pos_pairs.append([doc1["positions"][pos_ptr1], position])
                pos_ptr1 += 1
            if pos_pairs:
                result.append({
                    "doc_id": doc1["doc_id"],
                    "doc_name": doc_index[str(doc1["doc_id"])],
                    "doc_snippet": doc1["doc_snippet"],
                    "positions": doc1["positions"]
                })
                t1_pos = doc1["doc_snippet"].find(t1)
                t2_pos = doc1["doc_snippet"].find(t2)
                proximities.append(doc1["doc_snippet"][t1_pos:t2_pos + len(t2)])
            ptr1 += 1
            ptr2 += 1
        elif doc1["doc_id"] < doc2["doc_id"]:
            ptr1 += 1
        else:
            ptr2 += 1
    return result, proximities
def index_doc(word, doc_id, i, snippet):
    # Add one token occurrence (document doc_id, position i) to the
    # positional inverted index.
    if word == '*':
        index['*'].append({"doc_id": doc_id})
    else:
        word = ''.join(filter(lambda x: x in printable, word))
        if word not in stopwords:
            word = singularize(word)
            if word and word in simple_index:
                if doc_id not in simple_index[word]:
                    # First occurrence of this word in a new document: start a
                    # fresh posting and advance the pointer to it.
                    word_doc_ptr[word] += 1
                    index[word].append({
                        "doc_id": doc_id,
                        "doc_snippet": snippet,
                        "positions": [i],
                    })
                    simple_index[word].append(doc_id)
                else:
                    # Word already seen in this document: extend its positions.
                    doc = index[word][word_doc_ptr[word]]
                    doc["positions"].append(i)
            else:
                # First occurrence of this word anywhere.
                simple_index[word] = [doc_id]
                index[word] = [{
                    "doc_id": doc_id,
                    "doc_snippet": snippet,
                    "positions": [i]
                }]
                word_doc_ptr[word] = 0
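The indexing and search helpers here (index_doc, proximity_search, search) share several globals that never appear in these snippets. The sketch below is an assumption about how that state might be initialized and fed, not the original setup: the add_document driver, the sample stop list, and the tokenization are hypothetical, and index_doc plus a singularize() implementation (e.g. pattern.en's) are assumed to be in scope.

from string import printable  # characters kept when cleaning tokens

stopwords = {"the", "a", "an", "of"}  # placeholder stop list
index = {'*': []}    # term -> postings: {"doc_id", "doc_snippet", "positions"}
simple_index = {}    # term -> list of doc_ids containing the term
word_doc_ptr = {}    # term -> index of its most recent posting in index[term]
doc_index = {}       # str(doc_id) -> document name, used by proximity_search

def add_document(doc_id, name, text):
    # Hypothetical driver: register the document and index every token position.
    doc_index[str(doc_id)] = name
    snippet = text[:100]
    for i, token in enumerate(text.lower().split()):
        index_doc(token, doc_id, i, snippet)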
def data_split(datas):
    # Turn the parsed rows into dictionaries of ingredient -> amount.
    result = {}
    id = {}
    name = {}
    for i in range(len(datas)):
        data = datas[i]
        dst = {}
        for index in range(2, len(data)):
            dec = data[index].split('#')
            temp = dec[0]
            # Strip a few unimportant adjectives from the ingredient name.
            temp = re.sub('fresh|frozen|large|small|chunks', '', temp)
            temp = singularize(temp)
            if temp not in materials:
                materials.append(temp)
            dst[temp] = dec[1]
        result[i] = dst
        name[i] = data[1]
        id[data[1]] = data[0]
    return result, name, id
def emailWordsBagBayesian(words):
    emailWordsBagBayes = 0
    wordsCount = 0
    for word in words:
        if len(word) < 2:
            continue
        word = WordNetLemmatizer().lemmatize(word, 'v')
        word = singularize(word)
        emailWordsBagBayes += wordsBagBayesian(word)
        wordsCount += 1
    return emailWordsBagBayes / wordsCount if wordsCount != 0 else 0
def updateWordsValue(emails, status):
    for email in emails:
        for word in email.split():
            if len(word) < 2:
                continue
            word = WordNetLemmatizer().lemmatize(word, 'v')
            word = singularize(word)
            if word not in wordsValue:
                wordsValue[word] = 0
            wordsValue[word] += 1 / hamsNumber if status == 'ham' else -1 / spamsNumber
def updateWordsRepeat(emails, wordsRepeat, status):
    global wordsNumberOfHams, wordsNumberOfSpams
    for email in emails:
        for word in email.split():
            if len(word) < 2:
                continue
            word = WordNetLemmatizer().lemmatize(word, 'v')
            word = singularize(word)
            if word not in wordsRepeat:
                wordsRepeat[word] = 0
            wordsRepeat[word] += 1
            if status == 'ham':
                wordsNumberOfHams += 1
            else:
                wordsNumberOfSpams += 1
def data_split(datas):
    # Turn the parsed rows into dictionaries of ingredient -> amount, plus a
    # mapping between original and singularized ingredient names.
    result = {}
    id = {}
    name = {}
    t = {}
    for i in range(len(datas)):
        data = datas[i]
        dst = {}
        ingredient = {}
        for index in range(2, len(data)):
            dec = data[index].split('#')
            a = dec[0]
            b = singularize(a)
            temp = dec[0].lower()
            # Strip a few unimportant adjectives from the ingredient name.
            temp = re.sub('fresh|frozen|large|small|chunks', '', temp)
            temp = singularize(temp)
            if temp not in materials:
                materials.append(temp)
            dst[temp] = dec[1]
            ingredient[dec[0]] = temp
            if a != b:
                t[a] = b
                t[b] = a
        result[data[0]] = dst
        name[data[0]] = ingredient
        id[data[0]] = data[1]
    return result, name, id, t
def setEmailsValue(emails):
    values = []
    for email in emails:
        value = 0
        wordsProcessNumber = 0
        for word in email.split():
            word = WordNetLemmatizer().lemmatize(word, 'v')
            word = singularize(word)
            if word not in wordsValue:
                continue
            value += wordsValue[word]
            wordsProcessNumber += 1
        value /= wordsProcessNumber if wordsProcessNumber != 0 else 1
        values.append(value)
    return values
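The spam/ham helpers (updateWordsValue, updateWordsRepeat, setEmailsValue, emailWordsBagBayesian) rely on module-level counters and dictionaries that are only implied. The following is a rough, assumed wiring of that state with a toy corpus in place of the real dataset; it presumes the functions above live in the same module and that WordNetLemmatizer and singularize are importable.

wordsValue = {}            # word -> signed weight (positive leans ham, negative spam)
hamWordsRepeat = {}        # word -> occurrence count in ham mail (hypothetical name)
spamWordsRepeat = {}       # word -> occurrence count in spam mail (hypothetical name)
wordsNumberOfHams = 0
wordsNumberOfSpams = 0

# Toy corpus standing in for the real email dataset.
hams = ["please review the attached report", "team meeting moved to friday"]
spams = ["win a free prize now", "cheap loans click here"]
hamsNumber, spamsNumber = len(hams), len(spams)

updateWordsValue(hams, 'ham')
updateWordsValue(spams, 'spam')
updateWordsRepeat(hams, hamWordsRepeat, 'ham')
updateWordsRepeat(spams, spamWordsRepeat, 'spam')
print(setEmailsValue(["free prize for the team meeting"]))  # one averaged score per email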
def data_split(datas):
    # Turn the parsed rows into a list of ingredient -> amount dictionaries.
    result = []
    for data in datas:
        dst = {}
        for index in range(2, len(data)):
            dec = data[index].split('#')
            temp = dec[0]
            # Strip a few unimportant adjectives from the ingredient name.
            temp = re.sub('fresh|frozen|large|small|chunks', '', temp)
            temp = singularize(temp)
            if temp not in materials:
                materials.append(temp)
            dst[temp] = dec[1]
        result.append(dst)
    return result
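All three data_split variants assume rows whose first two fields are a recipe id and name, followed by "ingredient#amount" strings, and they append cleaned names to a shared materials list. A tiny illustrative call of the list-returning variant above, assuming it is defined in the same module as re, singularize (e.g. from pattern.en) and materials; the rows are made up.

materials = []  # shared ingredient vocabulary filled by data_split

datas = [
    ["1", "pancakes", "flour#200", "fresh eggs#2"],
    ["2", "omelette", "eggs#3", "small onions#1"],
]
# Each resulting dict maps a cleaned, singularized ingredient name to its amount string.
print(data_split(datas))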
def query(query_str):
    query_str = query_str.split(" ")
    q = np.zeros((len(vocab)))
    for term in query_str:
        term = ''.join(filter(lambda x: x in printable, term))
        if term not in stopwords:
            term = singularize(term)
            term = re.sub(r'ly$', r'', term)
            term = re.sub(r'ed$', r'', term)
            term = re.sub(r'ing$', r'', term)
            term = re.sub(r'nes$', r'', term)
            if len(term) >= 3:
                term_index = vocab.index(term)
                q[term_index] = idf[term]
    q /= np.linalg.norm(q)
    alpha = 0.001
    S = np.dot(D, q)
    idx = np.arange(S.size)[S >= alpha]
    res = list(map(int, sorted(idx[np.argsort(S[idx])] + 1)))
    return res
doc_tf = {}
idf = {}
for i in range(1, 51):
    with open(os.path.join(DATASET_DIR, str(i) + '.txt')) as file:
        words = file.read()
    words = re.sub(r'\n|--', r' ', words)
    words = re.sub(r'“|”|’|‘|;|,|!|:|\.|\?|\)|\(|\*', r'', words)
    words = words.lower()
    words = re.split(r" |-|\u2014", words)
    words = [word for word in words if word]
    for word in words:
        word = ''.join(filter(lambda x: x in printable, word))
        if word not in stopwords:
            word = singularize(word)
            word = re.sub(r'ly$', r'', word)
            word = re.sub(r'ed$', r'', word)
            word = re.sub(r'ing$', r'', word)
            word = re.sub(r'nes$', r'', word)
            if len(word) >= 3:
                if (i, word) not in doc_tf:
                    doc_tf[i, word] = 1
                    if word not in idf:
                        idf[word] = 1
                    else:
                        idf[word] += 1
                else:
                    doc_tf[i, word] += 1
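The query() function above consumes a vocabulary list vocab, idf weights, and a document matrix D, none of which are built by the counting loop. One plausible way to derive them from doc_tf and idf is sketched below; the log-scaled idf and L2-normalised rows are assumptions, not the original construction.

import numpy as np

N_DOCS = 50

# Convert raw document frequencies into idf weights (one common formulation).
for word in idf:
    idf[word] = np.log10(N_DOCS / idf[word])

vocab = sorted(idf.keys())
D = np.zeros((N_DOCS, len(vocab)))

# Row i-1 holds the tf-idf vector of document i.
for (doc_id, word), tf in doc_tf.items():
    D[doc_id - 1, vocab.index(word)] = tf * idf[word]

# L2-normalise the rows so that np.dot(D, q) in query() behaves like cosine similarity.
norms = np.linalg.norm(D, axis=1, keepdims=True)
D = D / np.where(norms == 0, 1, norms)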
def clean_job_resp_col(extract_job_data):
    stop_words = set(stopwords.words('english'))
    remove_punc = str.maketrans('', '', string.punctuation)
    extract_job_data['JobDescription'] = extract_job_data['JobDescription'] \
        .apply(lambda x: ' '.join([singularize(word)
                                   for word in str(x).translate(remove_punc).split()
                                   if word.lower() not in stop_words]))
    print('done')
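A small usage sketch for clean_job_resp_col, assuming pandas, NLTK's stopword list, and a singularize() implementation (e.g. pattern.en's) are available in the same module; the DataFrame contents are invented.

import string
import pandas as pd
from nltk.corpus import stopwords   # requires nltk.download('stopwords') once
from pattern.en import singularize  # assumed source of singularize()

extract_job_data = pd.DataFrame({
    'JobDescription': ["Manages teams, and writes weekly reports.",
                       "Builds data pipelines and dashboards!"]
})
clean_job_resp_col(extract_job_data)
# Prints the column with punctuation and stop words removed and words singularized.
print(extract_job_data['JobDescription'].tolist())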
def __call__(self, document):
    if self.delete_punctuation_marks:
        c = re.compile('[{}]'.format(re.escape(string.punctuation)))
        document = c.sub('', document)
    if self.delete_numeral:
        c = re.compile('[{}]'.format(re.escape(string.digits)))
        document = c.sub('', document)
    document = document.lower()
    texts = document.split()
    if self.stop_words is not None:
        texts = [word for word in texts if word not in self.stop_words]
    words = []
    load.polyglot_path = os.path.join(os.getcwd(), 'polyglot_data')
    os.path.sep = '/'
    n = ['NOUN', 'PRON', 'PROPN']
    a = ['ADJ', 'ADP']
    v = ['VERB', 'ADV', 'AUX']
    for token in texts:
        # Reduce each token to an initial form based on its POS tag.
        pos_tag = Text(token, 'en').pos_tags[0][1]
        if pos_tag in n:
            word = en.singularize(token)
        elif pos_tag in a:
            word = WordNetLemmatizer().lemmatize(token, 'a')
        elif pos_tag in v:
            word = en.lemma(token)
        else:
            word = token
        # if word == "ymy":  # TODO remove and fix 'ymy' bug
        #     word = 'your'
        words.append((word, pos_tag))
    unique_words = list(set(words))
    occurrences = sorted([(x[0], x[1], words.count(x)) for x in unique_words],
                         key=lambda y: y[2], reverse=True)
    if self.initial_form:
        return [w[0] for w in words], occurrences
    else:
        return texts, occurrences
def ans_data():
    print("ans_data")
    file = open('test.txt', 'r', encoding="UTF-8-sig")
    js = file.read()
    similar_test = json.loads(js)
    print(len(similar_test.keys()))
    file.close()
    data_test, name_test, id_test, t = test_data_split(get_data_test())
    data_train, name_train, id_train = train_data_split(train_data_read())
    data_train, material_index, material_sum, material_evg, material_th, material_count = \
        get_material_information(data_train)
    material_sum, material_error = clean_data(data_train, material_index, material_sum,
                                              material_evg, material_th, material_count)
    # Keep only highly similar recipes whose ingredients are not flagged as errors.
    similar_dec = {}
    for key in similar_test:
        temp = []
        for i in range(len(similar_test[key])):
            if similar_test[key][i][1] >= 0.9 and similar_test[key][i][0] not in material_error:
                temp.append(similar_test[key][i][0])
        similar_dec[key] = temp
    ans = []
    for key in data_test:
        recipe = str(key) + "," + str(id_test[key])
        for ingredient in data_test[key]:
            # Average the amounts of this ingredient over the similar training recipes.
            temp = []
            flag = False
            for i in range(len(similar_dec[key])):
                if ingredient in data_train[similar_dec[key][i]].keys():
                    if data_train[similar_dec[key][i]][ingredient] != '':
                        flag = True
                        temp.append(int(data_train[similar_dec[key][i]][ingredient]))
            if flag and len(temp) != 0:
                data_test[key][ingredient] = np.mean(temp)
            else:
                # Fall back to the global average for this ingredient, or "Null".
                try:
                    data_test[key][ingredient] = material_evg[ingredient]
                except KeyError:
                    data_test[key][ingredient] = "Null"
            recipe += ',' + str(ingredient) + "#" + str(data_test[key][ingredient])
        ans.append(recipe)
    # Write one recipe per line, appending a newline to each.
    file = open('./data/recipe2.csv', 'w', encoding='utf-8')
    for i in range(len(ans)):
        file.write(ans[i] + '\n')
    file.close()
    print("File saved successfully")
    # Re-read the raw test data and map the predicted amounts back to the
    # original (uncleaned) ingredient names.
    result = []
    test_data = read(test_data_path)
    ans = readans('./data/recipe2.csv')
    for key in test_data:
        recipe = str(key)
        for i in range(len(test_data[key])):
            print(key, test_data[key][i][0])
            ingredient = test_data[key][i][0].lower()
            # Strip a few unimportant adjectives from the ingredient name.
            temp = re.sub('fresh|frozen|large|small|chunks', '', ingredient)
            temp = singularize(temp)
            print(temp)
            try:
                recipe += "," + str(test_data[key][i][0]) + "#" + str(ans[key][temp])
            except KeyError:
                try:
                    recipe += "," + str(test_data[key][i][0]) + "#" + str(ans[key][ingredient])
                except KeyError:
                    recipe += "," + str(test_data[key][i][0]) + "#" + str(ans[key][ingredient + "®"])
        result.append(recipe)
    file = open('./data/recipe2.csv', 'w', encoding='utf-8')
    for i in range(len(result)):
        file.write(result[i] + '\n')
    file.close()
    print("File saved successfully")
def search(term):
    term = singularize(term)
    if term in index:
        return index[term], [term]
    else:
        return [], ""