def get_code(email, upload_at):
    user_id = select_user_id(email)
    if user_id == '':
        data = {'email': '0', 'code': '0'}
        final = json.dumps(data, ensure_ascii=False).encode('utf8')
        return final
    else:
        code = select_code(user_id)
        if code == '':
            # no code on record yet: create one, store it, and email it
            code = utils.create_code()
            query = "INSERT INTO code_user (user_id, code, upload_at) VALUES (%s,%s,%s);"
            data = (user_id, code, upload_at)
            conn.run_query(query, data)
            data = {'email': email, 'code': utils.remove_punctuation(str(code))}
            final = json.dumps(data, ensure_ascii=False).encode('utf8')
            # message body: "Your code is: <code>"
            mservice.send_email("*****@*****.**", "Rodolfo123123", email,
                                "Tu código es: " + str(code))
            return final
        else:
            # a code already exists: re-send it
            data = {'email': email, 'code': utils.remove_punctuation(str(code))}
            final = json.dumps(data, ensure_ascii=False).encode('utf8')
            mservice.send_email("*****@*****.**", "Rodolfo123123", email,
                                "Tu código es: " + str(code))
            return final
def search(paragraph):
    paragraph = split(lower(remove_punctuation(paragraph)))
    for i in topic:
        for j in paragraph:
            if i == j:
                return True
    return False
def start(paragraphs):
    for word in topic:
        s = split(remove_punctuation(lower(paragraphs)))
        for i in s:
            if i == word:
                return True
    return False
def about_helper(paragraph):
    paragraph = remove_punctuation(lower(paragraph)).split()
    # print(paragraph)
    for i in topic:
        if i in paragraph:
            return True
    return False
def select(paragraph):
    remove_punctuation_paragraph = remove_punctuation(paragraph)  # remove punctuation
    split_paragraph_list = split(remove_punctuation_paragraph)  # split paragraph
    for splited_paragraph in split_paragraph_list:
        if lower(splited_paragraph) in topic:  # compare a lowercased version
            return True
    return False
def is_about_topic(s):
    low_s = lower(remove_punctuation(s))
    for _s in split(low_s):
        for _topic in topic:
            if _s == _topic:
                return True
    return False
def helper(text):  # parameter renamed from `str`, which shadowed the builtin
    words = split(remove_punctuation(lower(text)))
    for i in range(len(words)):
        for j in range(len(topic)):
            if words[i] == topic[j]:
                return True
    return False
def f(x):
    splitted = split(x)
    ls = [lower(remove_punctuation(s)) for s in splitted]
    for item in ls:
        if item in topic:
            return True
    return False
def sentence_about(sentence):
    words = split(sentence)
    words = [lower(remove_punctuation(w)) for w in words]
    for w in words:
        if w in topic:
            return True
    return False
def helpler(paragraph):
    new_para = split(lower(remove_punctuation(paragraph)))
    for i in new_para:
        for x in topic:
            if i == x:
                return True
    return False
def select(paragraphs):
    paragraphs = lower(paragraphs)
    paragraphs = remove_punctuation(paragraphs)
    list_paragraphs = split(paragraphs)
    for words in topic:
        if words in list_paragraphs:
            return True
    return False
def update_password(email, code, password):
    user_id = select_user_id(email)
    code_consult = select_code(user_id)
    code_consult = utils.remove_punctuation(str(code_consult))
    if code_consult == code:
        query = "UPDATE user SET password = %s WHERE user_id = %s;"
        data = (password, user_id)
        conn.run_query(query, data)
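# Hedged sketch of the reset flow that get_code and update_password implement
# together; it assumes a configured database (`conn`) and mail service
# (`mservice`), and the address and timestamp below are placeholders:
payload = json.loads(get_code("user@example.com", "2024-01-01 00:00:00"))
# ...the user reads the code from their inbox, then resets the password...
update_password("user@example.com", payload["code"], "new-password")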
def calculate_word_vector_model(input_path, output_path=None):
    document = []
    with open(input_path) as f:
        for line in f:
            line = utils.remove_punctuation(line)
            cutted_line = jieba.cut(line)
            document.append(list(cutted_line))
    model = gensim.models.Word2Vec(document)
    if output_path is not None:  # model.save(None) would raise
        model.save(output_path)
    return model
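# Usage sketch for calculate_word_vector_model above, assuming a plain-text
# Chinese corpus at a hypothetical path; gensim trains Word2Vec on the
# jieba-segmented lines and the model can then answer similarity queries:
model = calculate_word_vector_model("corpus.txt", output_path="word2vec.model")
print(model.wv.most_similar("手机"))  # nearest neighbours, if the word is in vocab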
def f(p):
    p = remove_punctuation(p)
    p = lower(p)
    p = split(p)
    for i in p:
        for j in topic:
            if i == j:
                return True
    return False
def func(paragraph):
    paragraph = split(remove_punctuation(lower(paragraph)))
    filtered = [x for x in topic if x in paragraph]
    if filtered != []:
        return True
    else:
        return False
def is_topic_mentioned(paragraph):
    # paragraph is a string to check against the topic keywords
    lowered_paragraph = lower(paragraph)
    lowered_nopunc_paragraph = remove_punctuation(lowered_paragraph)
    lowered_nopunc_paragraph_list = split(lowered_nopunc_paragraph)
    for keyword in topic:
        if keyword in lowered_nopunc_paragraph_list:
            return True
    return False
def valid_topic(paragraph):
    paragraph = remove_punctuation(paragraph)
    paragraph = lower(paragraph)
    split_paragraph = split(paragraph)
    for split_words in split_paragraph:
        if split_words in topic:
            return True
    return False
def select(paragraph):
    paragraph = remove_punctuation(paragraph)
    paragraph = lower(paragraph)
    # list of all words in paragraph
    paragraph = split(paragraph)
    for word in topic:
        if word in paragraph:
            return True
    return False
def make_id_for_entry(entry, style='gscholar'):
    """Take entry as a dict, and return an ID to use for the bib entry."""
    if style != 'gscholar':
        raise NotImplementedError('Not implemented yet.')
    try:
        entry['title']
    except KeyError:
        logger.info('Title entry missing. Could not create id for entry.')
        logger.info(entry)
        raise KeyError('I could not find title information from the given DOI/'
                       'arXiv. This often happens with books, which for some '
                       'reason often do not include the title information.')
    try:
        entry['author']
        entry['year']
    except KeyError:
        # try to pull down additional information from google scholar
        logger.info('I could not find author/year information from DOI/arxiv,'
                    ' attempting to pull information down from gscholar.')
        gscholar_result = pull_info_from_gscholar(
            entry['title'], accepted_fields=['author', 'year'])
        if 'author' in gscholar_result and 'year' in gscholar_result:
            logger.info('Author/year information pulled from scholar.')
            entry.update(gscholar_result)
        else:
            raise KeyError("author, title and year are required.")
    title = entry['title']
    logger.info('I found the title "{}"'.format(title))
    year = entry['year']
    author = entry['author'].split(',')[0].lower()  # extract first author
    if author[0] == '{':
        author = author[1:]
    if author[-1] == '}':
        author = author[:-1]
    if ' ' in author:
        author = author.split(' ')[0]
    # -- extract first word (looking at "words" with more than 3 chars) --
    # gather all words in the title
    words_in_title = re.findall(r'\S+', title)
    # remove punctuation from words
    words_in_title = [utils.remove_punctuation(w) for w in words_in_title]
    # drop words with fewer than 4 chars
    words_in_title = [w for w in words_in_title if len(w) > 3]
    # extract first word
    first_word = words_in_title[0].lower()
    if first_word[0] == '{':
        first_word = first_word[1:]
    if first_word[-1] == '}':
        first_word = first_word[:-1]
    if '-' in first_word:
        first_word = first_word.split('-')[0]
    # build new id
    newid = '{}{}{}'.format(author, year, first_word)
    logger.info('New id for the given entry: `{}`'.format(newid))
    return newid
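# Worked example for make_id_for_entry above, using a hypothetical entry and
# assuming utils.remove_punctuation strips punctuation from single words and
# logger is configured:
entry = {
    'title': 'Attention Is All You Need',
    'author': 'Vaswani, Ashish and others',
    'year': '2017',
}
# first author "vaswani" + year + first title word longer than 3 chars:
print(make_id_for_entry(entry))  # -> vaswani2017attention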
def tag_comments_test(self, comments):
    comments = utils.remove_punctuation(comments)
    phrase_tag = set()
    phrase_list = comments.split(' ')
    for p in phrase_list:
        for t in self.tags_repo:
            match_part, if_same = wordvec.compare_phrase(p.strip(), t.strip(), self.model)
            if if_same:
                phrase_tag.add((p.strip(), t.strip()))
    return phrase_tag
def select(string):
    # make lowercase and remove punctuation
    s = remove_punctuation(string)
    s = lower(s)
    # split for comparing
    s = split(s)
    for i in topic:
        if i in s:
            return True
    return False
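# All of the selector variants above (search, start, about_helper, select,
# is_about_topic, helper, f, sentence_about, helpler, func, is_topic_mentioned,
# valid_topic) rely on module-level helpers `lower`, `remove_punctuation`, and
# `split`, plus a global `topic` word list. A minimal harness with hypothetical
# stand-ins for those helpers:
import string

topic = ['python', 'code']

def lower(s):
    return s.lower()

def remove_punctuation(s):
    return s.translate(str.maketrans('', '', string.punctuation))

def split(s):
    return s.split()

# Any of the variants should then agree:
assert sentence_about("I write Python!")
assert not sentence_about("Cats sleep all day.")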
def updatePassword(email, code, password):
    em = selectUserID(email)
    code_old = selectCode(em)
    code_old = utils.remove_punctuation(str(code_old))
    if code_old == code:
        query = "UPDATE user SET password = %s WHERE user_id = %s"
        data = (password, em)
        conn.run_query(query, data)
def tag_comments_database(self, comments):
    comments = utils.remove_punctuation(comments)
    print(comments)
    phrase_tag = set()
    phrase_list = comments.split(' ')
    for p in phrase_list:
        for t in self.tags_repo:
            match_part, if_same = wordvec.compare_phrase(p, t, self.model)
            if if_same:
                phrase_tag.add(t)
    return phrase_tag
def __iter__(self):
    with open(ANT_NLP_FILE_PATH, "r", encoding="utf8") as atec:
        logging.info('generating word corpus, processing file %s', ANT_NLP_FILE_PATH)
        for line in atec:
            line_code, s1, s2, label = line.strip().split("\t")
            s1 = utils.remove_punctuation(s1)
            s2 = utils.remove_punctuation(s2)
            yield list(jieba.cut(s1)) + list(jieba.cut(s2))
    for file in extract_wiki.list_all_files(PROCESSED_WIKI_FILE_PATH):
        logging.info('generating word corpus, processing file %s', file)
        with open(file, 'r', encoding="utf8") as wiki:
            for line in wiki:
                line = utils.remove_punctuation(line)
                if len(line) > 0:
                    # CJK unified ideographs occupy the Unicode range [0x4E00, 0x9FA5]
                    yield [
                        word for word in list(jieba.cut(line))
                        if word and 0x4E00 <= ord(word[0]) <= 0x9FA5
                    ]
def __iter__(self):
    with open(ANT_NLP_FILE_PATH, "r", encoding="utf8") as atec:
        logging.info('generating char corpus, processing file %s', ANT_NLP_FILE_PATH)
        for line in atec:
            lineno, s1, s2, label = line.strip().split("\t")
            s1 = utils.remove_punctuation(s1)
            s2 = utils.remove_punctuation(s2)
            yield list(s1) + list(s2)
    for file in extract_wiki.list_all_files(PROCESSED_WIKI_FILE_PATH):
        logging.info('generating char corpus, processing file %s', file)
        with open(file, 'r', encoding="utf8") as wiki:
            for line in wiki:
                line = utils.remove_punctuation(line)
                if len(line) > 0:
                    yield [
                        char for char in line
                        if char and 0x4E00 <= ord(char[0]) <= 0x9FA5
                    ]
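# Sketch of how either __iter__ above is typically consumed: the enclosing
# class (the names WordCorpus/CharCorpus here are hypothetical) is streamed
# straight into gensim, which iterates the corpus lazily on each pass:
sentences = WordCorpus()  # or CharCorpus(); both yield token lists
model = gensim.models.Word2Vec(sentences, vector_size=100, min_count=5)  # gensim 4.x API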
def train_word_vector(source, dict, wordvec):
    # note: the `dict` parameter shadows the builtin of the same name
    utils.jieba_add_dict(dict)
    # DataFrame.from_csv was removed in pandas 1.0;
    # pandas.read_csv(source, sep='\t', index_col=0) is the modern equivalent
    comments_df = DataFrame.from_csv(source, sep='\t')
    document = []
    for line in comments_df['comment'].values:
        line = utils.remove_punctuation(line)
        cutted_line = jieba.cut(line)
        document.append(list(cutted_line))
    model = gensim.models.Word2Vec(document)
    print('saving word vector model')
    model.save(wordvec)
    return model
def clean(text):
    text = remove_accents(text)
    text = expand_contractions(text)
    text = handle_units(text)
    text = convert_word_to_number(text)
    text = remove_punctuation(text)
    doc = nlp(text)
    text = perform_spell_check(doc)
    doc = nlp(text)
    text = convert_plural_to_singular(doc)
    return text
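# Hedged example for the clean() pipeline above; it assumes every helper is in
# scope and that `nlp` is a loaded spaCy pipeline, e.g.
#     nlp = spacy.load("en_core_web_sm")
# The exact output depends on each helper's behaviour:
print(clean("It's 5km away!"))  # e.g. "it is 5 kilometer away"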
def insertCode(email, upload_at):
    user_id = selectUserID(email)
    if user_id == '':
        data = {'email': '0', 'code': '0'}
        final = json.dumps(data, ensure_ascii=False).encode('utf8')
        return final
    else:
        em = selectUserID(email)
        code = selectCode(em)
        if code == '':
            code = utils.createCode()
            query = "INSERT INTO code_user (user_id, code, upload_at) VALUES (%s,%s,%s);"
            data = (em, code, upload_at)
            conn.run_query(query, data)
            data = {'email': email, 'code': code}
            final = json.dumps(data, ensure_ascii=False).encode('utf8')
            mservice.sendEmail(
                "*****@*****.**", "Qchw-2017", email,
                str(code) + " is your Quechua ASR verification code")
            return final
        else:
            code_old = selectCode(em)
            data = {
                'email': email,
                'code': utils.remove_punctuation(str(code_old))
            }
            final = json.dumps(data, ensure_ascii=False).encode('utf8')
            mservice.sendEmail(
                "*****@*****.**", "Qchw-2017", email,
                utils.remove_punctuation(str(code_old)) +
                " is your Quechua ASR verification code")
            return final
def main(args):
    """
    This processes the output of fairseq-generate so that it can be scored
    with sacrebleu and so that it has the shared task format.
    """
    cands = []
    seen_cands = set()
    current_source = None
    for line in args.infile:
        tokens = line.strip().split("\t")
        if line.startswith("S-"):
            # it's hard to have fairseq pass prompt ids through the
            # training/evaluation process, so we resort to regenerating ids
            # based on the prompt text. we have to be careful that the text
            # is *exactly* the same, or the id generation will be wrong.
            current_source = debpe.clean(tokens[1]) if not args.no_clean else tokens[1]
            textID = makeID(current_source)
            print(f"\n{textID}{FIELDSEP}{current_source}", file=args.outfile)
            cands = []
            seen_cands.clear()
        elif line.startswith("T-"):
            pass
        elif line.startswith("H-") and len(tokens) == 3 and '-inf' not in line:
            score = float(tokens[1])
            if len(cands) == 0:
                top_score = score
                if args.threshold != 0.0:
                    prompt_threshold = (-1.0 * args.threshold) + top_score
            # this is the prediction; there may be many of these.
            if ((args.candlimit == -1 or len(cands) < args.candlimit) and
                    (args.threshold == 0.0 or score > prompt_threshold)):
                hyp = debpe.clean(tokens[2]) if not args.no_clean else tokens[2]
                hyp = hyp.lower()
                # remove language code if present
                if hyp.startswith("<") and len(hyp) >= 4 and hyp[3] == '>':
                    hyp = hyp[5:]
                if args.remove_punctuation:
                    hyp = remove_punctuation(hyp)
                if hyp not in seen_cands:
                    print(hyp, file=args.outfile)
                    cands.append(hyp)
                    seen_cands.add(hyp)
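# Hypothetical argparse wiring for main() above, inferred from the attributes
# it reads (infile, outfile, no_clean, threshold, candlimit,
# remove_punctuation); the flag names are assumptions:
import argparse
import sys

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', type=argparse.FileType('r'))
    parser.add_argument('--outfile', type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('--no-clean', action='store_true')
    parser.add_argument('--threshold', type=float, default=0.0)
    parser.add_argument('--candlimit', type=int, default=-1)
    parser.add_argument('--remove-punctuation', action='store_true')
    main(parser.parse_args())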
def tag_comments(comment, keys):
    tags = set()
    comment = utils.remove_punctuation(comment)
    comment = comment.strip(' ')
    split_comment = comment.split(' ')
    ff = open('../data/tag_comments', 'w')  # opened but never written to or closed
    for phrase in split_comment:
        phrase = phrase.strip(' ')
        for key in keys:
            if phrase.find(key) > -1:
                if len(phrase) < 7 and len(phrase) > 2:
                    # len(phrase) counts characters, not the encoded byte
                    # length (each Chinese character is 3 bytes in UTF-8)
                    comment = comment.replace(phrase, "$%s$" % (phrase))
                    tags.add(phrase)
    result = "%s >> %s" % (comment.rstrip(), "\t".join(list(tags)))
    return result
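# Usage sketch for tag_comments above with a hypothetical Chinese review and
# keyword list, assuming utils.remove_punctuation leaves the spaces intact;
# matched 3-6 character phrases are wrapped in $...$ and collected as tags:
print(tag_comments("物流很快 包装完好", ["物流", "包装"]))
# -> "$物流很快$ $包装完好$ >> 物流很快\t包装完好" (tag order may vary)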
def test_remove_punctuation(self):
    tester = "-,.:"
    tester = utils.remove_punctuation(tester)
    self.assertEqual(tester, " ")
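# One implementation consistent with the test above (an assumption, not
# necessarily the project's actual utils code): collapse each run of
# punctuation into a single space, so "-,.:" becomes " ".
import re

def remove_punctuation(text):
    # replace every maximal run of non-word, non-space characters with ' '
    return re.sub(r'[^\w\s]+', ' ', text)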