def test_simple(self):
    self.assertEqual(list(wordninja.split('somewords')), ['some', 'words'])
def test_digits(self):
    self.assertEqual(list(wordninja.split('win32intel')), ['win', '32', 'intel'])
def test_simple(self):
    self.assertEqual(list(wordninja.split('derekanderson')), ['derek', 'anderson'])
def preprocess_tweet(tweet):
    text = tweet
    for b1 in BUGS_1:
        text = text.replace(b1, ' ' + b1 + ' ')
    for b2 in BUGS_2:
        text = text.replace(b2, ' ' + b2)
    for b3 in BUGS_3:
        text = text.replace(b3, ' ')
    text = text.split()
    full_text = ''
    for wd in text:
        wd = wd.replace("'", '')
        emph_1 = re.findall(r'(([a-zA-Z])\2{2,})', wd)
        if len(emph_1) > 0:
            for x in emph_1:
                wd = wd.replace(x[0], x[1])
        if wd.startswith('#') or wd.startswith('@'):
            wd = wd[1:]
            sort_1 = re.findall(r'[A-Z]{2,}', wd)
            for x in sort_1:
                if len(x) > 0:
                    wd = wd.replace(x, ' ' + x[0] + x[1:].lower())
            sort_2 = re.findall(r'[0-9]*', wd)
            for x in sort_2:
                if len(x) > 0:
                    wd = wd.replace(x, ' ' + x + ' ')
            sort_3 = re.findall(r'[A-Z][^A-Z]*', wd)
            for x in sort_3:
                if len(x) > 0:
                    wd = wd.replace(x, x + ' ')
            check_wd = wd.split()
            for cw in check_wd:
                def cor_names(word):
                    list_cand = []
                    for n in NAMES:
                        if word.lower().startswith(n):
                            list_cand.append(n)
                    if len(list_cand) > 0:
                        fin_name = max(list_cand, key=len)
                        if len(fin_name) > 3:
                            return fin_name
                        else:
                            return word
                    else:
                        return word

                prob_name = cor_names(cw)
                if prob_name != cw:
                    x = len(prob_name)
                    try:
                        full_text += cw[0].upper() + cw[1:x] + ' ' + cw[x].upper() + cw[x + 1:] + ' '
                    except:
                        full_text += cw[0].upper() + cw[1:x] + ' '
                else:
                    if len(cw) > 3:
                        split_wd = wordninja.split(cw)
                        if len(split_wd) < len(cw):
                            for s_wd in split_wd:
                                full_text += check_slang(s_wd) + ' '
                        else:
                            full_text += check_slang(cw) + ' '
                    else:
                        full_text += check_slang(cw) + ' '
        else:
            if len(wd) > 3:
                split_wd = wordninja.split(wd)
                if len(split_wd) < len(wd):
                    for s_wd in split_wd:
                        full_text += check_slang(s_wd) + ' '
                else:
                    full_text += check_slang(wd) + ' '
            else:
                full_text += check_slang(wd) + ' '
    return full_text
def slice_word(input):
    return wj.split(input)
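# A minimal usage sketch for slice_word above (an assumption: `wj` is wordninja
# imported as `import wordninja as wj`, as in the input.txt example further down
# this page). The expected outputs mirror the unit tests at the top of the page.
import wordninja as wj

def slice_word(input):
    return wj.split(input)

print(slice_word('somewords'))   # ['some', 'words']
print(slice_word('win32intel'))  # ['win', '32', 'intel']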
def _subtask(self, s):
    s = s.lower()
    tokens = wordninja.split(s)
    # tokens = word_tokenize(s)
    pos = pos_tag(tokens)
    return pos
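# A hedged, self-contained sketch of the _subtask step above: lowercase the string,
# split it with wordninja, then POS-tag the pieces with NLTK's pos_tag (requires the
# averaged_perceptron_tagger data to be downloaded). The standalone function name is
# hypothetical; the original is a method.
import wordninja
from nltk import pos_tag

def subtask(s):
    s = s.lower()
    tokens = wordninja.split(s)  # e.g. 'somewords' -> ['some', 'words']
    return pos_tag(tokens)       # list of (token, POS-tag) pairs

print(subtask('SomeWords'))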
# print(news)
namedata, nameclass = getnamedata()
print(len(namedata))
# dataset = createVocabList(namedata)
# print(len(dataset))

# 2. Data preprocessing: train/test split and text feature vectorization
# X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=33)  # randomly sample 25% of the data as the test set
X_train, X_test, y_train, y_test = train_test_split(namedata, nameclass, test_size=0.25)  # randomly sample 25% of the data as the test set

xsplit_train = []
for pername in X_train:
    inputname = ""
    name = wordninja.split(pername)
    for i in name:
        inputname = inputname + " " + i
    xsplit_train.append(inputname)

xsplit_test = []
for pername in X_test:
    inputname = ""
    name = wordninja.split(pername)
    for i in name:
        inputname = inputname + " " + i
    xsplit_test.append(inputname)

# Text feature vectorization
vec = CountVectorizer()
xvec_train = vec.fit_transform(xsplit_train)
def google(query: str, suggestion_count: int = 0) -> None:
    """Uses Google's search engine parser and gets the first result that shows up on a Google search.

    Notes:
        - If it is unable to get the result, Jarvis sends a request to ``suggestqueries.google.com``
        - This is to rephrase the query and then look it up using the search engine parser once again.
        - ``suggestion_count`` is used to limit the number of times suggestions are used.
        - ``suggestion_count`` is also used to make sure the suggestions and parsing don't run in an infinite loop.
        - This happens when ``google`` gets the exact search as suggested ones which failed to fetch results earlier.

    Args:
        suggestion_count: Integer value that keeps incrementing when ``Jarvis`` looks up for suggestions.
        query: Takes the voice recognized statement as argument.
    """
    results = []
    try:
        google_results = GoogleSearch().search(query, cache=False)
        results = [result['titles'] for result in google_results]
    except NoResultsOrTrafficError:
        suggest_url = "https://suggestqueries.google.com/complete/search"
        params = {
            "client": "firefox",
            "q": query,
        }
        response = requests.get(suggest_url, params)
        if not response:
            return
        try:
            suggestion = response.json()[1][1]
            suggestion_count += 1
            if suggestion_count >= 3:  # avoids infinite suggestions over the same suggestion
                speaker.speak(text=response.json()[1][0].replace('=', ''), run=True)  # picks the closest match and Googles it
                return
            else:
                google(suggestion, suggestion_count)
        except IndexError:
            return

    if not results:
        return

    for result in list(results):  # iterate over a copy so removing items is safe
        if len(result.split()) < 3:
            results.remove(result)

    if not results:
        return

    results = results[0:3]  # picks top 3 (first appeared on Google)
    results.sort(key=lambda x: len(x.split()), reverse=True)  # sorts in reverse by the word count of each sentence
    output = results[0]  # picks the top most result
    if '\n' in output:
        required = output.split('\n')
        modify = required[0].strip()
        split_val = ' '.join(wordninja.split(modify.replace('.', 'rEpLaCInG')))
        sentence = split_val.replace(' rEpLaCInG ', '.')
        repeats = []  # captures repeated words by adding them to the empty list
        [repeats.append(word) for word in sentence.split() if word not in repeats]
        refined = ' '.join(repeats)
        output = refined + required[1] + '.' + required[2]
        output = output.replace('\\', ' or ')
    match_word = re.search(r'(\w{3},|\w{3}) (\d,|\d|\d{2},|\d{2}) \d{4}', output)
    if match_word:
        output = output.replace(match_word.group(), '')
    output = output.replace('\\', ' or ')
    speaker.speak(text=output, run=True)
def evaluate_bored_scribe():
    data = request.get_json()
    logging.info("data sent for evaluation {}".format(data))
    result = [{
        "id": i["id"],
        "encryptionCount": 0,
        "originalText": ""
    } for i in data]
    test = [i["encryptedText"] for i in data]

    from codeitsuisse.bored_scribe_py import ANS
    ANS_ = ANS[:]
    _ANS = {}
    for i in range(len(ANS_)):
        ANS_[i] = "".join(ANS_[i].split(" "))
        _ANS[ANS_[i]] = ANS[i]

    ans = []
    ii = -1
    for s in test:
        ii += 1
        num_words = [0] * 26
        mx = 0
        mx_score = -(2 ** 31)
        for i in range(26):
            score = 0
            t = rot(s, i)
            # print(t, end=" ")
            for j in range(len(t)):
                if t[j] in ["z", "q", "x"]:
                    score -= 35
            # print(score, end=" ")
            for j in range(len(t) - 3):
                if t[j] + t[j + 1] + t[j + 2] == "the":
                    score += 100
                if t[j] + t[j + 1] == "is":
                    score += 5
                if t[j] + t[j + 1] + t[j + 2] == "are":
                    score += 8
                if t[j] + t[j + 1] + t[j + 2] == "was":
                    score += 8
                if t[j] + t[j + 1] + t[j + 2] + t[j + 3] == "were":
                    score += 12
                if t[j] + t[j + 1] + t[j + 2] + t[j + 3] == "have":
                    score += 8
                if t[j] + t[j + 1] + t[j + 2] + t[j + 3] == "from":
                    score += 6
                if t[j] + t[j + 1] == "to":
                    score += 3
                if t[j] + t[j + 1] == "of":
                    score += 3
                if t[j] + t[j + 1] == "th":
                    score += 3
                if t[j] + t[j + 1] == "er":
                    score += 3
                if t[j] + t[j + 1] == "on":
                    score += 3
                if t[j] + t[j + 1] == "in":
                    score += 3
                if t[j] + t[j + 1] == "at":
                    score += 3
                if t[j] + t[j + 1] == "an":
                    score += 3
            if score > mx_score:
                mx_score = score
                mx = i
        ans += [wordninja.split(rot(s, mx))]
        if "".join(ans[-1]) in _ANS:
            ans[-1] = _ANS["".join(ans[-1])]
        else:
            # print(ans)
            i = 1
            while i < len(ans[-1]):
                # print(ans[-1][i])
                if len(ans[-1][i]) == 1 and ans[-1][i] != "a":
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif ans[-1][i] == "re" and i + 1 < len(ans[-1]):
                    ans[-1][i] += ans[-1][i + 1]
                    ans[-1].pop(i + 1)
                    i += 1
                elif ans[-1][i] == "un" and i + 1 < len(ans[-1]):
                    ans[-1][i] += ans[-1][i + 1]
                    ans[-1].pop(i + 1)
                    i += 1
                elif ans[-1][i] == "im" and i + 1 < len(ans[-1]):
                    ans[-1][i] += ans[-1][i + 1]
                    ans[-1].pop(i + 1)
                    i += 1
                elif ans[-1][i] == "al" and i - 1 >= 0:
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif ans[-1][i] == "ze" and i - 1 >= 0:
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif ans[-1][i] == "zed" and i - 1 >= 0:
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif ans[-1][i] == "ably" and i - 1 >= 0:
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif ans[-1][i] == "ion" and i - 1 >= 0 and ans[-1][i - 1][-1] == "t":
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif ans[-1][i] == "ionate" and i - 1 >= 0 and ans[-1][i - 1][-1] == "t":
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif ans[-1][i] == "able" and i - 1 >= 0 and not (ans[-1][i - 1][-1] in ["a", "e", "i", "o", "u"]):
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                else:
                    i += 1
            ans[-1] = " ".join(ans[-1])

    # print(1)
    for i in range(len(ans)):
        result[i]["originalText"] = ans[i]
    # print(ans)

    for i in range(len(test)):
        # print("i:", i)
        f = "".join(ans[i].split(" "))
        cnt = 0
        res1, res2, res3 = l_r_palin(f)
        # print(res1, res2, f[res1:res2 + 1])
        vis = {}
        while test[i] != f:
            if f not in vis:
                vis[f] = True
            else:
                cnt = 25
                break
            f = rot(f, res3 + sum(ord(f[j]) for j in range(res1, res2 + 1)))
            # print(f)
            cnt += 1
        result[i]["encryptionCount"] = cnt
    # print(3)

    logging.info("My result :{}".format(result))
    # return json.dumps(result)
    return jsonify(result)
def wrapper(a, b):
    # j = lambda x: [z for y in x for z in y]
    if b == True:
        return list(jieba.cut(a))
    else:
        return wj.split(a)
def recognize(image_path, weights_path, char_dict_path, ord_map_dict_path, is_vis, is_english=True): """ :param image_path: :param weights_path: :param char_dict_path: :param ord_map_dict_path: :param is_vis: :return: """ image = cv2.imread(image_path, cv2.IMREAD_COLOR) new_heigth = 32 scale_rate = new_heigth / image.shape[0] new_width = int(scale_rate * image.shape[1]) new_width = new_width if new_width > CFG.ARCH.INPUT_SIZE[0] else \ CFG.ARCH.INPUT_SIZE[0] # TODO: Fix it, force 100. new_width = 100 image = cv2.resize(image, (new_width, new_heigth), interpolation=cv2.INTER_LINEAR) image_vis = image image = np.array(image, np.float32) / 127.5 - 1.0 print(new_width, new_heigth) inputdata = tf.placeholder( dtype=tf.float32, shape=[1, new_heigth, new_width, CFG.ARCH.INPUT_CHANNELS], name='input' ) codec = tf_io_pipline_fast_tools.CrnnFeatureReader( char_dict_path=char_dict_path, ord_map_dict_path=ord_map_dict_path ) net = crnn_net.ShadowNet( phase='test', hidden_nums=CFG.ARCH.HIDDEN_UNITS, layers_nums=CFG.ARCH.HIDDEN_LAYERS, num_classes=CFG.ARCH.NUM_CLASSES ) inference_ret = net.inference( inputdata=inputdata, name='shadow_net', reuse=False ) decodes, _ = tf.nn.ctc_beam_search_decoder( inputs=inference_ret, sequence_length=int(new_width / 4) * np.ones(1), merge_repeated=True, beam_width=10 ) decode = decodes[0] print(decode) # config tf saver saver = tf.train.Saver() # config tf session sess_config = tf.ConfigProto(allow_soft_placement=True) sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TEST.GPU_MEMORY_FRACTION sess_config.gpu_options.allow_growth = CFG.TEST.TF_ALLOW_GROWTH sess = tf.Session(config=sess_config) with sess.as_default(): saver.restore(sess=sess, save_path=weights_path) preds = sess.run(decode, feed_dict={inputdata: [image]}) print(preds) preds = codec.sparse_tensor_to_str(preds)[0] if is_english: preds = ' '.join(wordninja.split(preds)) # return preds_evaluated input_graph_name = "input_graph.pb" output_graph_name = "output_graph.pb" export_dir = 'export' tf.train.write_graph(sess.graph, export_dir, input_graph_name) tf.logging.info("Write graph at %s." % os.path.join(export_dir, input_graph_name)) export_graph = tf.Graph() with export_graph.as_default(): freeze_graph.freeze_graph(input_graph=os.path.join(export_dir, input_graph_name), input_saver="", input_binary=False, input_checkpoint=weights_path, output_node_names='CTCBeamSearchDecoder', restore_op_name="", filename_tensor_name="", output_graph=os.path.join(export_dir, output_graph_name), clear_devices=True, initializer_nodes=None, variable_names_blacklist="") tf.logging.info("Export model at %s." % os.path.join(export_dir, output_graph_name)) logger.info('Predict image {:s} result: {:s}'.format( ops.split(image_path)[1], preds) ) if is_vis: plt.figure('CRNN Model Demo') plt.imshow(image_vis[:, :, (2, 1, 0)]) plt.show() sess.close() return
nlp = spacy.load('en_core_web_md', disable=['parser', 'tagger', 'ner'])
stop_words = set(stopwords.words('english'))

for index, row in dataset_merge.iterrows():
    text = re.sub(r'((?mi)https?:[\w\/._-]*)', ' ', str(row.tweet))
    text = re.sub(r'(?mi)\S*@\S*\s?', ' ', text)
    text = re.sub(r'(?mi)\S*.com\S*\s?', ' ', text)
    text = re.sub(
        r'(?mi)(can\'t|couldn\'t|should\'t|won\'t|arn\'t|wasn\'t|wern\'t|dont|cant)',
        'not', text)
    text = re.sub(r'(?mi)[^\w#]', ' ', text)
    text = re.sub(r'(?mi)[#]', '', text)
    text = re.sub(r'(?mi)[\d]', ' ', text)
    doc = nlp(str(text))
    doc = set([token.lemma_ for token in doc])
    doc = [token.strip() for token in doc]
    doc = [wn.split(str(token)) for token in doc]
    doc = [item for subtoken in doc for item in subtoken]
    doc = set([token for token in doc if (len(token) > 3)])
    doc = ' '.join([w.lower() for w in doc if w not in stop_words])
    dataset_merge.loc[index, 'tweet'] = doc

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
corpus = [sent for sent in dataset_merge.tweet]
X = cv.fit_transform(corpus).toarray()
X = pd.DataFrame(X)

# The entire dataset is divided back into its train and test parts, as it was previously arranged by id.
X_train = X.iloc[0:7920, :].values
Y_train = train_label
X_test = X.iloc[7920:9873, :].values
def split(word):
    if word in word2emb:
    # if True:
        return [word]
    return wordninja.split(word)
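# A hedged sketch of the fallback in split() above, assuming `word2emb` is a
# word-to-embedding lookup (e.g. a dict): in-vocabulary words are kept whole,
# everything else is segmented by wordninja. The embedding values are hypothetical;
# the expected split of 'win32intel' mirrors the test_digits case on this page.
import wordninja

word2emb = {'intel': [0.1, 0.2]}  # hypothetical embedding table

def split(word):
    if word in word2emb:
        return [word]
    return wordninja.split(word)

print(split('intel'))       # ['intel'] -- in vocabulary, kept as-is
print(split('win32intel'))  # ['win', '32', 'intel'] -- out of vocabulary, split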
names = []
for num in df.name:
    num = re.sub("_", " ", num)
    num = re.sub("[0-9]+", "", num)
    num = num.strip()
    if num.isupper():
        num = num.lower()
    if len(num.split()) >= 2:
        if len(num.split(" ")[0]) <= 2:
            num = num.split(" ")[1]
        else:
            num = num.split(" ")[0]
    if len(num) >= 8:
        b = num
        num = wordninja.split(num)[0]
        if len(num) <= 2:
            num1 = wordninja.split(num)
            for name in num1:
                num = max(num, name)
    names.append(num)

df["clean_names"] = names
df.shape[0]

df
def convert_mm_examples_to_features(self, examples, label_list, tokenizer):
    label_map = {label: i for i, label in enumerate(label_list)}
    features = []
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    for (ex_index, example) in enumerate(examples):
        hashtags = []
        tokens = []
        sent = example.text.split()
        i = 0
        while i < len(sent):
            if sent[i] == "#" and i < len(sent) - 1:
                while sent[i] == "#" and i < len(sent) - 1:
                    i += 1
                if sent[i] != "#":
                    temp = wordninja.split(sent[i])
                    for _ in temp:
                        hashtags.append(_)
            else:
                if sent[i] != "#":
                    temp = wordninja.split(sent[i])
                    for _ in temp:
                        tokens.append(_)
            i += 1

        tokens = " ".join(tokens)
        hashtags = " ".join(hashtags) if len(hashtags) != 0 else "None"
        tokens = tokenizer.tokenize(tokens)
        hashtags = tokenizer.tokenize(hashtags)

        #####
        # image_text = None
        # image_text_dic = get_image_text()
        # if example.img_id in image_text_dic:
        #     image_text = list(image_text_dic[example.img_id])
        # else:
        #     image_text = ["None"]
        #####

        if len(tokens) > self.max_seq_length - 2:
            tokens = tokens[:(self.max_seq_length - 2)]
        if len(hashtags) > self.max_hashtag_length - 2:
            hashtags = hashtags[:(self.max_hashtag_length - 2)]

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        added_input_mask = [1] * (len(input_ids) + 49)
        padding = [0] * (self.max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        added_input_mask += padding

        hashtags = ["[CLS]"] + hashtags + ["[SEP]"]
        hashtag_input_ids = tokenizer.convert_tokens_to_ids(hashtags)
        hashtag_input_mask = [1] * len(hashtag_input_ids)
        hashtag_padding = [0] * (self.max_hashtag_length - len(hashtag_input_ids))
        hashtag_input_ids += hashtag_padding
        hashtag_input_mask += hashtag_padding

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(hashtag_input_ids) == self.max_hashtag_length
        assert len(hashtag_input_mask) == self.max_hashtag_length

        label_id = label_map[example.label]

        # process images
        image_name = example.img_id
        image_path = os.path.join(self.image_path, image_name + ".jpg")
        image = self.image_process(image_path, transform)  # 3*224*224

        features.append(
            MMInputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            added_input_mask=added_input_mask,
                            img_feat=image,
                            hashtag_input_ids=hashtag_input_ids,
                            hashtag_input_mask=hashtag_input_mask,
                            label_id=label_id))
        if ex_index % 1000 == 0:
            logger.info("processed image num: " + str(ex_index) + " **********")

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_added_input_mask = torch.tensor([f.added_input_mask for f in features], dtype=torch.long)
    all_img_feats = torch.stack([f.img_feat for f in features])
    all_hashtag_input_ids = torch.tensor([f.hashtag_input_ids for f in features], dtype=torch.long)
    all_hashtag_input_mask = torch.tensor([f.hashtag_input_mask for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    return all_input_ids, all_input_mask, all_added_input_mask, all_img_feats, all_hashtag_input_ids, all_hashtag_input_mask, all_label_ids
import csv
import json
import re
import wordninja

f1 = open(r'D:\REST API\result\path-all6366531348915523614.csv', 'r', newline='')
reader1 = csv.reader(f1)
splitresult = {}
for row in reader1:
    pathresults = {}
    for i in range(1, len(row)):
        resulttemp = []
        hirSplitList = row[i].split('/')
        for hir in hirSplitList:
            # print(hir)
            if hir != '':
                a = re.sub(u"\\{.*?}", "", hir)
                resultfir = wordninja.split(a)
                resulttemp.append(resultfir)
        pathresults[row[i]] = resulttemp
    splitresult[row[0]] = pathresults

print(splitresult)
json_str = json.dumps(splitresult, indent=4)
with open(r'D:\REST API\result\pathsplithir.json', 'w') as json_file:
    json_file.write(json_str)

# print(wordninja.split('authorized Certificates'))
# print(wordninja.split('authorizedCertificates'))
# print(wordninja.split('authorized-Certificates'))
# print(wordninja.split('authorizedcertificates'))
# print(wordninja.split('authorized/certificates'))
def recognize(image_path, weights_path, char_dict_path, ord_map_dict_path, is_vis, is_english=True): """ :param image_path: :param weights_path: :param char_dict_path: :param ord_map_dict_path: :param is_vis: :return: """ image = cv2.imread(image_path, cv2.IMREAD_COLOR) new_heigth = 32 scale_rate = new_heigth / image.shape[0] new_width = int(scale_rate * image.shape[1]) new_width = new_width if new_width > CFG.ARCH.INPUT_SIZE[ 0] else CFG.ARCH.INPUT_SIZE[0] image = cv2.resize(image, (new_width, new_heigth), interpolation=cv2.INTER_LINEAR) image_vis = image image = np.array(image, np.float32) / 127.5 - 1.0 inputdata = tf.placeholder( dtype=tf.float32, shape=[1, new_heigth, new_width, CFG.ARCH.INPUT_CHANNELS], name='input') codec = tf_io_pipline_fast_tools.CrnnFeatureReader( char_dict_path=char_dict_path, ord_map_dict_path=ord_map_dict_path) net = crnn_net.ShadowNet(phase='test', hidden_nums=CFG.ARCH.HIDDEN_UNITS, layers_nums=CFG.ARCH.HIDDEN_LAYERS, num_classes=CFG.ARCH.NUM_CLASSES) inference_ret = net.inference(inputdata=inputdata, name='shadow_net', reuse=False) decodes, _ = tf.nn.ctc_beam_search_decoder( inputs=inference_ret, sequence_length=int(new_width / 4) * np.ones(1), merge_repeated=False, beam_width=10) # config tf saver saver = tf.train.Saver() # config tf session sess_config = tf.ConfigProto(allow_soft_placement=True) sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TEST.GPU_MEMORY_FRACTION sess_config.gpu_options.allow_growth = CFG.TEST.TF_ALLOW_GROWTH sess = tf.Session(config=sess_config) with sess.as_default(): saver.restore(sess=sess, save_path=weights_path) preds = sess.run(decodes, feed_dict={inputdata: [image]}) preds = codec.sparse_tensor_to_str(preds[0])[0] if is_english: preds = ' '.join(wordninja.split(preds)) logger.info('Predict image {:s} result: {:s}'.format( ops.split(image_path)[1], preds)) if is_vis: plt.figure('CRNN Model Demo') plt.imshow(image_vis[:, :, (2, 1, 0)]) plt.show() sess.close() return
def preprocessing(self, onAllData=False, saveData=True):
    '''Function to perform the data cleaning activity: spell checking, splitting the combined words and creating new attributes.

    Arguments:
        onAllData = signifies whether to process the entire dataframe or only specific categories based on the threshold value
        saveData = whether to store the filtered data on the disk
    '''
    """ Removing the adient categories with fewer observations """
    if onAllData:
        data = self.data
    else:
        data = self.filterCategories(self.threshValue)

    tqdm.pandas()

    ''' Apply the spellcheck transformation on each observation '''
    print('Applying spellcheck operation on each word.')
    st = time.time()
    data['correctedText'] = data['comments_text'].progress_apply(
        lambda x: ' '.join([self.spellCheck(item) for item in x.lower().replace(',', ' ').split()]))
    print(f'Spellcheck operation finished in {np.round((time.time()-st)/60.0, 3)} mins.')

    ''' Splitting the combined words '''
    print('Starting combined word separation operation on each record.')
    st = time.time()
    data['correctedText'] = data['correctedText'].progress_apply(
        lambda x: ' '.join([' '.join(wordninja.split(item)) for item in x.replace(',', ' ').split(' ')]))
    print(f'Operation finished in {np.round((time.time()-st)/60.0, 3)} mins.')

    if saveData:
        path = '/'.join(self.dataFilePath.split('/')[:-1]) + '/'
        try:
            data.to_excel(path + 'FilteredData.xlsx')
        except Exception as e:
            print(f'{e}\n error in storing file hence using pickle to store the file')
            # pickle the DataFrame instead of writing the object to the file handle directly
            data.to_pickle(path + 'FilterData.pkl')

    return data
    else:
        Not_printed += 1
        if Not_printed > 1 and Not_printed % 5 == 0:
            cv2.putText(img, "NT", (10, 50), font, 0.8, (0, 255, 0), 2, cv2.LINE_AA)
            cv2.imshow('frame', img)
            print("not displaying" + str(cnt))

    # Display the resulting frame
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    if label1 == "o" and cnt > 50 and len(que) > 2:
        break

# When everything is done, release the capture
cap.release()
cv2.destroyAllWindows()

while len(que) > 0 and que[len(que) - 1] == "o":
    que.pop()
while len(que) > 0 and que[0] == "o":
    que.pop(0)

s = ""
for char in que:
    s = s + char
s1 = ninja.split(s)
print(s)
print(*s1, sep=" ")
def word_compounds(word: str, min_length=2):
    return [
        compound for compound in wordninja.split(word)
        if len(compound) >= min_length
    ]
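# A small usage sketch for word_compounds above: raising min_length filters out the
# short fragments wordninja produces. The example input and its split come from the
# test_digits case on this page.
import wordninja

def word_compounds(word: str, min_length=2):
    return [c for c in wordninja.split(word) if len(c) >= min_length]

print(word_compounds('win32intel'))                # ['win', '32', 'intel']
print(word_compounds('win32intel', min_length=3))  # ['win', 'intel'] -- drops '32'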
def Clean_Tweets(input_text, sentiment_type):
    # remove the b'' prefix from the string
    input_text = str(input_text)
    input_text = input_text[2:len(input_text) - 1]
    input_text = " ".join((input_text).split())
    if sentiment_type == "ANEW":
        input_text = ' '.join([input_text]).lower()

    # remove @replies
    replies = re.findall("@[\w]*", input_text)
    for i in replies:
        input_text = re.sub(i, '', input_text)

    # remove URLs
    urls = re.findall("https?://[A-Za-z0-9./]+", input_text)
    for i in urls:
        input_text = re.sub(i, '', input_text)

    # remove newlines
    a = []
    if len(re.findall(r"\\n", input_text)) == 0:
        pass
    if len(re.findall(r"\\n", input_text)) > 0:
        for match in finditer(r'\\n', input_text):
            a.append(match.start())
        b = (np.asarray(a))
        b1 = (np.asarray(a)) + 1
        alpha = np.zeros(shape=(len(a), 2))
        alpha[:, 0] = b
        alpha[:, 1] = b1
        alpha = (np.concatenate(alpha))
        alpha = [int(x) for x in alpha]
        newline_list = list(input_text)
        for i in range(len(alpha)):
            newline_list[alpha[i]] = " "
        newline_list = ''.join(newline_list)
        newline_list = ' '.join(newline_list.split())
        input_text = newline_list.strip()

    # convert unicode escape sequences to ASCII
    input_text = (
        input_text.replace('\\xe2\\x80\\x99', "'")
        .replace('\\xc3\\xa9', 'e')
        .replace('\\xe2\\x80\\x90', '-')
        .replace('\\xe2\\x80\\x91', '-')
        .replace('\\xe2\\x80\\x92', '-')
        .replace('\\xe2\\x80\\x93', '-')
        .replace('\\xe2\\x80\\x94', '-')
        .replace('\\xe2\\x80\\x94', '-')
        .replace('\\xe2\\x80\\x98', "'")
        .replace('\\xe2\\x80\\x9b', "'")
        .replace('\\xe2\\x80\\x9c', '"')
        .replace('\\xe2\\x80\\x9c', '"')
        .replace('\\xe2\\x80\\x9d', '"')
        .replace('\\xe2\\x80\\x9e', '"')
        .replace('\\xe2\\x80\\x9f', '"')
        .replace('\\xe2\\x80\\xa6', '...')
        .replace('\\xe2\\x80\\xb2', "'")
        .replace('\\xe2\\x80\\xb3', "'")
        .replace('\\xe2\\x80\\xb4', "'")
        .replace('\\xe2\\x80\\xb5', "'")
        .replace('\\xe2\\x80\\xb6', "'")
        .replace('\\xe2\\x80\\xb7', "'")
        .replace('\\xe2\\x81\\xba', "+")
        .replace('\\xe2\\x81\\xbb', "-")
        .replace('\\xe2\\x81\\xbc', "=")
        .replace('\\xe2\\x81\\xbd', "(")
        .replace('\\xe2\\x81\\xbe', ")"))

    # remove emojis
    a = []
    if len(re.findall(r"\\x", input_text)) == 0:
        input_text = input_text
    if len(re.findall(r"\\x", input_text)) > 0:
        for match in finditer(r'\\x', input_text):
            a.append(match.start())
        b = (np.asarray(a))
        b1 = (np.asarray(a)) + 1
        b2 = (np.asarray(a)) + 2
        b3 = (np.asarray(a)) + 3
        alpha = np.zeros(shape=(len(a), 4))
        alpha[:, 0] = b
        alpha[:, 1] = b1
        alpha[:, 2] = b2
        alpha[:, 3] = b3
        alpha = (np.concatenate(alpha))
        l = [i for i in range(len(input_text))]
        beta = (np.in1d(l, alpha))
        gamma = list(input_text)
        gamma_indx = []
        for i in range(len(beta) - 1):
            if (((beta[i] == True) & (beta[i + 1] == False)) | ((beta[i - 1] == False) & (beta[i] == True))):
                gamma_indx.append(i)
                gamma[i] = " "
        gamma = (''.join(gamma))
        res = [i for i, val in enumerate(beta) if not val]
        new = gamma_indx + res
        new.sort()
        string1 = []
        for i in new:
            string1.append(gamma[i])
        string1 = ''.join(string1)
        string1 = ' '.join(string1.split())
        input_text = string1.strip()

    # fix contractions
    input_text = contractions.fix(input_text)

    # remove punctuation, but keep periods, question marks, and exclamation points
    # replace punctuation characters with a space - ensure words remain separated
    punctuations = '''()-=+[]{};:"\,<>/|¦`$%^&*_~'''
    no_punct = ""
    for char in input_text:
        if char in punctuations:
            char = ' '
        if char not in punctuations:
            no_punct = no_punct + char
    input_text = " ".join(no_punct.split())

    # hashtags
    input_text = tokenizer(input_text)
    a = []
    for token in input_text:
        a.append(token.text)
    ' '.join(a)
    for i in range(len(a)):
        if "#" in a[i]:
            t = word_tokenize(a[i])
            if len(t) > 1:
                t = ' '.join(wordninja.split(t[1]))
                a[i] = t
            else:
                a[i] = ""
    a = ' '.join(a)
    input_text = " ".join(a.split())

    # remove numbers
    input_text = re.sub(r'\d+', '', input_text)

    # remove the RT marker for retweets and any leading whitespace
    input_text = (input_text.replace('RT', '')).lstrip()

    # replace amp with and
    input_text = (input_text.replace('amp', 'and')).lstrip()

    return input_text
#print("document['tokens'] = " , document['tokens'] ) #print("len(document['tokens']) = " , len(document['tokens']) ) #if i_num_tokens_max < len(document['tokens']): # i_num_tokens_max = len(document['tokens']) #if i_num_tokens_min > len(document['tokens']): # i_num_tokens_min = len(document['tokens']) #doc_complete.append( test_input_doc_text.strip('\n') ) #not_in_vocab_set = set() doc_list = [] doc_split_word_list = [] for s_word in document['tokens']: split_word_list_tmp = [] split_word_list_tmp = wordninja.split(s_word) #print("len(split_word_list_tmp) = " , len(split_word_list_tmp) ) if len(split_word_list_tmp) == 0: print("ZERO: ", s_word) sys.exit(0) for i_word in split_word_list_tmp: i_accum_vocab += 1 split_word_set.add(i_word) doc_split_word_list.append(i_word) entire_vocab_set.add(s_word) #print(s_word in english_words) if not (s_word in english_words): not_in_vocab_set.add(s_word) else:
count = 0 file_out = open("output.txt", 'w') fname = "1996_sigrams.tsv" fh = open(fname) # separate_word = list() words = list() for line in fh: # separate_word[1] = separate_word[1].replace('\n', '') # print(separate_word[1]) # file_out.write(separate_word[1]) # file_out.write('\n') slovo = line.split('\t') slovo[1] = slovo[1].replace('\n', '') if int(slovo[1]) < 2: words = wordninja.split(slovo[0]) count = count + 1 for word in words: file_out.write(word) file_out.write(' ') file_out.write('\n') else: continue count = count + 1 print(count) file_out = open("output.txt", 'r') for word, frequency in fdist_tgs.most_common(50): print(u'{};{}'.format(word, frequency))
import wordninja as wj

w = wj.split(open("input.txt", "r").read())  # shall we use more?
print(w)
def test_caps(self):
    self.assertEqual(list(wordninja.split('DEREKANDERSON')), ['DEREK', 'ANDERSON'])
dict = {}
count = 0
minimum = 0
ones = 0
twos = 0
more = 0
with open('data/expiringnames.csv', newline='') as csvfile:
    print(csvfile)
    spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
    for row in spamreader:
        cells = row[0].split(',')
        name = cells[0]
        date = cells[1]
        price = float(cells[2])
        words = wordninja.split(name)
        if price == 0.01:
            minimum = minimum + 1
        else:
            count = count + 1
        if len(words) == 1:
            ones = ones + 1
        if len(words) == 2:
            twos = twos + 1
        if len(words) > 2:
            more = more + 1
        for word in words:
            if word in dict:
                dict[word]['price'] = dict[word]['price'] + price
def test_apostrophes(self):
    self.assertEqual(list(wordninja.split("that'sthesheriff'sbadge")), ["that's", "the", "sheriff's", "badge"])
def clean(sen, remove_stopwords=True, contraction=True, pun=True, lemma_=False):
    # re.sub(pattern, repl, string, count=0, flags=0)
    # pattern: the regular-expression pattern string
    # repl: the replacement (either a string or a function)
    # string: the string being processed / substituted
    # count: number of matches to replace; by default all matches are replaced
    # flags: regex flags
    sen = re.sub(r'\{\{(.*?)\}\}', "", sen)
    # catch the left over links that have no closing braces
    sen = re.sub(r'\{\{(.*)', "", sen)
    # remove the quotes that are left over, the filter
    sen = re.sub(r'\'+', "", sen)
    # remove the filenames of images but retain the title text they are called from
    sen = re.sub(r'(.*)\|', "", sen)
    sen = sen.strip(""" '!:?-_().,'"[]{};*""")
    sen = ' '.join(
        [w.strip(""" '!:?-_().,'"[]{};*""") for w in re.split(' ', sen)])
    sen = re.sub("[-+]?[.\d]*[\d]+[:,.\d]*", " NUMBER ", sen)

    # splitting words
    string = []
    for x in sen.split():
        if len(x) > 6:
            for i in wordninja.split(x):  # word segmentation
                if len(i) > 2:
                    string.append(i)
        else:
            string.append(x)
    sen = " ".join(string)

    # contraction handling
    new_text = []
    for word in sen.split():  # split on whitespace by default
        # if word in contractions:
        #     new_text.append(contractions[word])
        # else:
        new_text.append(word)
    sen = " ".join(new_text)

    sen = re.sub(r"[^A-Za-z0-9:(),\'\`]", " ", sen)
    sen = re.sub(r"\b\d+\b", "", sen)  # remove numbers
    sen = re.sub('\s+', ' ', sen)  # matches any whitespace character
    sen = re.sub(r'(?:^| )\w(?:$| )', ' ', sen).strip()  # removing single characters

    # Optionally, remove stop words
    if remove_stopwords:
        sen = " ".join([i for i in sen.split() if i not in stop])

    # Optionally remove punctuation
    if pun:
        sen = ''.join(ch for ch in sen if ch not in exclude)

    # Optionally lemmatization
    if lemma_:
        normalized = " ".join(WordNetLemmatizer().lemmatize(word)
                              for word in sen.split())

    return sen.strip().lower()  # convert to lowercase
def mirkoPreprocessing(row, args, text_processor):
    SMILEY = load_dict_emoticon()
    tweet = row.text
    if type(tweet) == float:
        return tweet
    tweet = html.unescape(tweet)

    if args.normalizeNoise:
        tweet = str(" ".join(text_processor.pre_process_doc(tweet)))

    if args.doEmoticon == True:
        words = tweet.split()
        reformed = [SMILEY[word] if word in SMILEY else word for word in words]
        tweet = " ".join(reformed)
    if args.removeEmoticon == True:
        words = tweet.split()
        reformed = [" " if word in SMILEY else word for word in words]
        tweet = " ".join(reformed)
    if args.normalizeEmoticon == True:
        words = tweet.split()
        reformed = ["<emoticon>" if word in SMILEY else word for word in words]
        tweet = " ".join(reformed)

    if args.doEmoji == True:
        number = emoji.emoji_count(tweet)
        if number != 0:
            tweet = emoji.demojize(tweet, delimiters=("", ""))
            tweet = tweet.replace("_", " ")
    if args.removeEmoji == True:
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        tweet = emoji_pattern.sub(r'', tweet)
    if args.normalizeEmoji == True:
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        tweet = emoji_pattern.sub(r' <emoji> ', tweet)

    if args.removeMention == True:
        tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)", " ", tweet).split())
    if args.normalizeMention == True:
        tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)", " @user ", tweet).split())

    if args.removeUrl == True:
        tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())
    if args.normalizeUrl == True:
        tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " url ", tweet).split())

    if args.rawHashtag == False:
        elems = [tag.strip("#") for tag in tweet.split() if tag.startswith("#")]
        for elem in elems:
            traslate = ' '.join(wordninja.split(elem))
            if args.unpackHastags == True:
                tweet = tweet.replace("#" + elem, traslate)
            if args.tagAndUnpackHastags == True:
                tweet = tweet.replace("#" + elem, "<" + traslate + ">")
            if args.removeHastags == True:
                tweet = tweet.replace("#" + elem, " ")
            if args.normalizeHastags == True:
                tweet = tweet.replace("#" + elem, "#hashtag")

    if args.removePunctuation:
        tweet = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", tweet).split())

    tweet = re.sub(r"\s+", ' ', tweet)

    if args.doLower:
        return tweet.lower()
    else:
        return tweet
def test_with_a_custom_regex(self):
    param = 'toseparate.ornottoseparate.com'
    custom_regex = re.compile("[^a-zA-Z0-9\.]+")
    self.assertNotEqual(wordninja.split(param, re=custom_regex),
                        wordninja.split(param))