Example #1
File: test.py Project: macunha1/wordninja
 def test_simple(self):
     self.assertEqual(list(wordninja.split('somewords')), ['some', 'words'])
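For context, a minimal standalone sketch of the wordninja API that these tests exercise (assuming the wordninja package is installed):

import wordninja

# split() takes a run-together string and returns a list of probable words,
# using the library's built-in English word-frequency model.
print(wordninja.split('somewords'))  # ['some', 'words'], as asserted above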
Example #2
File: test.py Project: yf1291/nlp3
 def test_digits(self):
     self.assertEqual(list(wordninja.split('win32intel')),
                      ['win', '32', 'intel'])
Example #3
File: test.py Project: yf1291/nlp3
 def test_simple(self):
     self.assertEqual(list(wordninja.split('derekanderson')),
                      ['derek', 'anderson'])
Example #4
def preprocess_tweet(tweet):

    text = tweet

    for b1 in BUGS_1:
        text = text.replace(b1, ' ' + b1 + ' ')
    for b2 in BUGS_2:
        text = text.replace(b2, ' ' + b2)
    for b3 in BUGS_3:
        text = text.replace(b3, ' ')

    text = text.split()

    full_text = ''

    for wd in text:

        wd = wd.replace("'", '')

        emph_1 = re.findall(r'(([a-zA-Z])\2{2,})', wd)
        if len(emph_1) > 0:
            for x in emph_1:
                wd = wd.replace(x[0], x[1])

        if wd.startswith('#') or wd.startswith('@'):
            wd = wd[1:]
            sort_1 = re.findall(r'[A-Z]{2,}', wd)
            for x in sort_1:
                if len(x) > 0:
                    wd = wd.replace(x, ' ' + x[0] + x[1:].lower())

            sort_2 = re.findall(r'[0-9]*', wd)
            for x in sort_2:
                if len(x) > 0:
                    wd = wd.replace(x, ' ' + x + ' ')

            sort_3 = re.findall(r'[A-Z][^A-Z]*', wd)
            for x in sort_3:
                if len(x) > 0:
                    wd = wd.replace(x, x + ' ')

            check_wd = wd.split()
            for cw in check_wd:

                def cor_names(word):
                    list_cand = []
                    for n in NAMES:
                        if word.lower().startswith(n):
                            list_cand.append(n)
                    if len(list_cand) > 0:
                        fin_name = max(list_cand, key=len)
                        if len(fin_name) > 3:
                            return fin_name
                        else:
                            return word

                    else:
                        return word

                prob_name = cor_names(cw)
                if prob_name != cw:
                    x = len(prob_name)
                    try:
                        full_text += (cw[0].upper() + cw[1:x] + ' '
                                      + cw[x].upper() + cw[x + 1:] + ' ')
                    except IndexError:
                        full_text += cw[0].upper() + cw[1:x] + ' '

                else:
                    if len(cw) > 3:
                        split_wd = wordninja.split(cw)
                        if len(split_wd) < len(cw):
                            for s_wd in split_wd:
                                full_text += check_slang(s_wd) + ' '
                        else:
                            full_text += check_slang(cw) + ' '
                    else:
                        full_text += check_slang(cw) + ' '
        else:
            if len(wd) > 3:
                split_wd = wordninja.split(wd)
                if len(split_wd) < len(wd):
                    for s_wd in split_wd:
                        full_text += check_slang(s_wd) + ' '
                else:
                    full_text += check_slang(wd) + ' '
            else:
                full_text += check_slang(wd) + ' '

    return full_text
Example #5
def slice_word(input):
    return wj.split(input)
Example #6
 def _subtask(self, s):
     s = s.lower()
     tokens = wordninja.split(s)
     #tokens = word_tokenize(s)
     pos = pos_tag(tokens)
     return pos
Example #7
# print(news)
namedata, nameclass = getnamedata()
print(len(namedata))

# dataset = createVocabList(namedata)
# print(len(dataset))

# 2. Data preprocessing: split into training and test sets, then vectorize the text features
# X_train,X_test,y_train,y_test = train_test_split(news.data,news.target,test_size=0.25,random_state=33) # randomly sample 25% of the data as the test set
X_train,X_test,y_train,y_test = train_test_split(namedata,nameclass,test_size=0.25) # randomly sample 25% of the data as the test set


xsplit_train=[]
for pername in X_train:
    inputname =""
    name =  wordninja.split(pername)
    for i in name:
        inputname = inputname +" "+i
    xsplit_train.append(inputname)
    
xsplit_test=[]
for pername in X_test:
    inputname =""
    name =  wordninja.split(pername)
    for i in name:
        inputname = inputname +" "+i
    xsplit_test.append(inputname)      

# Vectorize the text features
vec = CountVectorizer()
xvec_train = vec.fit_transform(xsplit_train)
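The two loops above do identical work on the train and test splits; a hedged refactor sketch (the helper name split_names is ours, and it drops the loops' leading space, which CountVectorizer ignores anyway):

def split_names(names):
    # Split each concatenated name into words and rejoin them with spaces.
    return [" ".join(wordninja.split(n)) for n in names]

xsplit_train = split_names(X_train)
xsplit_test = split_names(X_test)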
Example #8
def google(query: str, suggestion_count: int = 0) -> None:
    """Uses Google's search engine parser and gets the first result that shows up on a Google search.

    Notes:
        - If it is unable to get a result, Jarvis sends a request to ``suggestqueries.google.com``
          to rephrase the query, and then looks it up with the search engine parser once again.
        - ``suggestion_count`` limits the number of times suggestions are used, ensuring that the
          suggestion and parsing steps do not run in an infinite loop.
        - Such a loop can occur when ``google`` receives the exact same suggestion that already
          failed to fetch results earlier.

    Args:
        suggestion_count: Integer that is incremented each time ``Jarvis`` looks up a suggestion.
        query: Takes the voice-recognized statement as argument.
    """
    results = []
    try:
        google_results = GoogleSearch().search(query, cache=False)
        results = [result['titles'] for result in google_results]
    except NoResultsOrTrafficError:
        suggest_url = "https://suggestqueries.google.com/complete/search"
        params = {
            "client": "firefox",
            "q": query,
        }
        response = requests.get(suggest_url, params)
        if not response:
            return
        try:
            suggestion = response.json()[1][1]
            suggestion_count += 1
            if suggestion_count >= 3:  # avoids infinite suggestions over the same suggestion
                speaker.speak(text=response.json()[1][0].replace('=', ''),
                              run=True)  # picks the closest match and Google's it
                return
            else:
                google(suggestion, suggestion_count)
        except IndexError:
            return

    if not results:
        return

    # Filter instead of removing while iterating, which would skip elements.
    results = [result for result in results if len(result.split()) >= 3]

    if not results:
        return

    results = results[0:3]  # picks top 3 (first appeared on Google)
    results.sort(key=lambda x: len(x.split()), reverse=True)  # sorts in reverse by the word count of each sentence
    output = results[0]  # picks the top most result
    if '\n' in output:
        required = output.split('\n')
        modify = required[0].strip()
        split_val = ' '.join(wordninja.split(modify.replace('.', 'rEpLaCInG')))
        sentence = split_val.replace(' rEpLaCInG ', '.')
        repeats = []  # Captures repeated words by adding them to the empty list
        [repeats.append(word) for word in sentence.split() if word not in repeats]
        refined = ' '.join(repeats)
        output = refined + required[1] + '.' + required[2]
    output = output.replace('\\', ' or ')
    match_word = re.search(r'(\w{3},|\w{3}) (\d,|\d|\d{2},|\d{2}) \d{4}', output)
    if match_word:
        output = output.replace(match_word.group(), '')
    speaker.speak(text=output, run=True)
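For reference, a minimal standalone sketch of the suggestion fallback described in the docstring, using the same endpoint and response indexing as the function above (the JSON layout is inferred from that code, not from an official contract):

import requests

def google_suggestions(query: str) -> list:
    # Ask Google's suggest endpoint for alternative phrasings of the query.
    response = requests.get(
        "https://suggestqueries.google.com/complete/search",
        params={"client": "firefox", "q": query},
    )
    if not response.ok:
        return []
    # Per the code above, response.json() looks like [query, [suggestion, ...]].
    return response.json()[1]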
Example #9
def evaluate_bored_scribe():
    data = request.get_json()
    logging.info("data sent for evaluation {}".format(data))

    result = [{
        "id": i["id"],
        "encryptionCount": 0,
        "originalText": ""
    } for i in data]

    test = [i["encryptedText"] for i in data]

    from codeitsuisse.bored_scribe_py import ANS
    ANS_ = ANS[:]
    _ANS = {}
    for i in range(len(ANS_)):
        ANS_[i] = "".join(ANS_[i].split(" "))
        _ANS[ANS_[i]] = ANS[i]

    ans = []
    ii = -1
    for s in test:
        ii += 1
        mx = 0
        mx_score = -(2**31)
        # Score every Caesar rotation: penalise rare letters, reward common
        # English words and bigrams, and keep the best-scoring shift.
        ngram_scores = {"the": 100, "were": 12, "have": 8, "from": 6, "are": 8,
                        "was": 8, "is": 5, "to": 3, "of": 3, "th": 3, "er": 3,
                        "on": 3, "in": 3, "at": 3, "an": 3}
        for i in range(26):
            t = rot(s, i)
            score = -35 * sum(1 for ch in t if ch in ("z", "q", "x"))
            for j in range(len(t) - 3):
                for ngram, bonus in ngram_scores.items():
                    if t[j:j + len(ngram)] == ngram:
                        score += bonus
            if score > mx_score:
                mx_score = score
                mx = i
        ans += [wordninja.split(rot(s, mx))]
        if ("".join(ans[-1]) in _ANS):
            ans[-1] = _ANS["".join(ans[-1])]
        else:
            # print(ans)
            i = 1
            while (i < len(ans[-1])):
                # print(ans[-1][i])
                if (len(ans[-1][i]) == 1 and ans[-1][i] != "a"):
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif (ans[-1][i] == "re" and i + 1 < len(ans[-1])):
                    ans[-1][i] += ans[-1][i + 1]
                    ans[-1].pop(i + 1)
                    i += 1
                elif (ans[-1][i] == "un" and i + 1 < len(ans[-1])):
                    ans[-1][i] += ans[-1][i + 1]
                    ans[-1].pop(i + 1)
                    i += 1
                elif (ans[-1][i] == "im" and i + 1 < len(ans[-1])):
                    ans[-1][i] += ans[-1][i + 1]
                    ans[-1].pop(i + 1)
                    i += 1
                elif (ans[-1][i] == "al" and i - 1 >= 0):
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif (ans[-1][i] == "ze" and i - 1 >= 0):
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif (ans[-1][i] == "zed" and i - 1 >= 0):
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif (ans[-1][i] == "ably" and i - 1 >= 0):
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif (ans[-1][i] == "ion" and i - 1 >= 0
                      and ans[-1][i - 1][-1] == "t"):
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif (ans[-1][i] == "ionate" and i - 1 >= 0
                      and ans[-1][i - 1][-1] == "t"):
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                elif (ans[-1][i] == "able" and i - 1 >= 0 and
                      not (ans[-1][i - 1][-1] in ["a", "e", "i", "o", "u"])):
                    ans[-1][i - 1] += ans[-1][i]
                    ans[-1].pop(i)
                else:
                    i += 1
            ans[-1] = " ".join(ans[-1])

    # print(1)

    for i in range(len(ans)):
        result[i]["originalText"] = ans[i]

    # print(ans)

    for i in range(len(test)):
        # print("i:", i)
        f = "".join(ans[i].split(" "))
        cnt = 0
        res1, res2, res3 = l_r_palin(f)
        # print(res1, res2, f[res1:res2 + 1])
        vis = {}
        while (test[i] != f):
            if not (f in vis):
                vis[f] = True
            else:
                cnt = 25
                break
            f = rot(f, res3 + sum(ord(f[j]) for j in range(res1, res2 + 1)))
            # print(f)
            cnt += 1
        result[i]["encryptionCount"] = cnt

    # print(3)

    logging.info("My result :{}".format(result))
    # return json.dumps(result);
    return jsonify(result)
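The snippet above calls rot() and l_r_palin() without defining them. l_r_palin() is not reconstructed here, but a minimal Caesar-shift sketch of rot() consistent with how it is used (lowercase letters, shift taken modulo 26) might look like:

def rot(s: str, k: int) -> str:
    # Rotate each lowercase letter forward by k positions; leave other characters alone.
    k %= 26
    return "".join(
        chr((ord(c) - ord("a") + k) % 26 + ord("a")) if "a" <= c <= "z" else c
        for c in s
    )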
Example #10
def wrapper(a, b):
    # j=lambda x: [z for y in x for z in y]
    if b == True:
        return list(jieba.cut(a))
    else:
        return wj.split(a)
Example #11
def recognize(image_path, weights_path, char_dict_path, ord_map_dict_path,
              is_vis, is_english=True):
    """

    :param image_path:
    :param weights_path:
    :param char_dict_path:
    :param ord_map_dict_path:
    :param is_vis:
    :return:
    """
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    new_heigth = 32
    scale_rate = new_heigth / image.shape[0]
    new_width = int(scale_rate * image.shape[1])
    new_width = new_width if new_width > CFG.ARCH.INPUT_SIZE[0] else \
    CFG.ARCH.INPUT_SIZE[0]
    # TODO: Fix it,  force 100.
    new_width = 100

    image = cv2.resize(image, (new_width, new_heigth),
                       interpolation=cv2.INTER_LINEAR)
    image_vis = image
    image = np.array(image, np.float32) / 127.5 - 1.0

    print(new_width, new_heigth)

    inputdata = tf.placeholder(
        dtype=tf.float32,
        shape=[1, new_heigth, new_width, CFG.ARCH.INPUT_CHANNELS],
        name='input'
    )

    codec = tf_io_pipline_fast_tools.CrnnFeatureReader(
        char_dict_path=char_dict_path,
        ord_map_dict_path=ord_map_dict_path
    )

    net = crnn_net.ShadowNet(
        phase='test',
        hidden_nums=CFG.ARCH.HIDDEN_UNITS,
        layers_nums=CFG.ARCH.HIDDEN_LAYERS,
        num_classes=CFG.ARCH.NUM_CLASSES
    )

    inference_ret = net.inference(
        inputdata=inputdata,
        name='shadow_net',
        reuse=False
    )

    decodes, _ = tf.nn.ctc_beam_search_decoder(
        inputs=inference_ret,
        sequence_length=int(new_width / 4) * np.ones(1),
        merge_repeated=True,
        beam_width=10
    )
    decode = decodes[0]

    print(decode)
    # config tf saver
    saver = tf.train.Saver()

    # config tf session
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TEST.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.TEST.TF_ALLOW_GROWTH

    sess = tf.Session(config=sess_config)

    with sess.as_default():

        saver.restore(sess=sess, save_path=weights_path)

        preds = sess.run(decode, feed_dict={inputdata: [image]})

        print(preds)

        preds = codec.sparse_tensor_to_str(preds)[0]
        if is_english:
            preds = ' '.join(wordninja.split(preds))

        # return preds_evaluated
        input_graph_name = "input_graph.pb"
        output_graph_name = "output_graph.pb"
        export_dir = 'export'
        tf.train.write_graph(sess.graph, export_dir, input_graph_name)
        tf.logging.info("Write graph at %s." % os.path.join(export_dir,
                                                            input_graph_name))

        export_graph = tf.Graph()
        with export_graph.as_default():
            freeze_graph.freeze_graph(input_graph=os.path.join(export_dir,
                                                               input_graph_name),
                                      input_saver="",
                                      input_binary=False,
                                      input_checkpoint=weights_path,
                                      output_node_names='CTCBeamSearchDecoder',
                                      restore_op_name="",
                                      filename_tensor_name="",
                                      output_graph=os.path.join(export_dir,
                                                                output_graph_name),
                                      clear_devices=True,
                                      initializer_nodes=None,
                                      variable_names_blacklist="")

        tf.logging.info("Export model at %s." % os.path.join(export_dir,
                                                             output_graph_name))


        logger.info('Predict image {:s} result: {:s}'.format(
            ops.split(image_path)[1], preds)
        )

        if is_vis:
            plt.figure('CRNN Model Demo')
            plt.imshow(image_vis[:, :, (2, 1, 0)])
            plt.show()

    sess.close()

    return
Example #12
nlp = spacy.load('en_core_web_md', disable=['parser', 'tagger', 'ner'])
stop_words = set(stopwords.words('english'))
for index, row in dataset_merge.iterrows():
    text = re.sub(r'((?mi)https?:[\w\/._-]*)', ' ', str(row.tweet))
    text = re.sub(r'(?mi)\S*@\S*\s?', ' ', text)
    text = re.sub(r'(?mi)\S*\.com\S*\s?', ' ', text)  # escape the dot so this only strips ".com" tokens
    text = re.sub(
        r'(?mi)(can\'t|couldn\'t|should\'t|won\'t|arn\'t|wasn\'t|wern\'t|dont|cant)',
        'not', text)
    text = re.sub(r'(?mi)[^\w#]', ' ', text)
    text = re.sub(r'(?mi)[#]', '', text)
    text = re.sub(r'(?mi)[\d]', ' ', text)
    doc = nlp(str(text))
    doc = set([token.lemma_ for token in doc])
    doc = [token.strip() for token in doc]
    doc = [wn.split(str(token)) for token in doc]
    doc = [item for subtoken in doc for item in subtoken]
    doc = set([token for token in doc if (len(token) > 3)])
    doc = ' '.join([w.lower() for w in doc if not w in stop_words])
    dataset_merge.loc[index, 'tweet'] = doc

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
corpus = [sent for sent in dataset_merge.tweet]
X = cv.fit_transform(corpus).toarray()
X = pd.DataFrame(X)
# the combined data is split back into its train and test portions, matching the original id-based arrangement
X_train = X.iloc[0:7920, :].values
Y_train = train_label
X_test = X.iloc[7920:9873, :].values
Example #13
 def split(word):
     if word in word2emb:
         #if True:
         return [word]
     return wordninja.split(word)
Example #14
names=[]
for num in df.name:
    num=re.sub("_"," ",num) 
    num=re.sub("[0-9]+","",num)
    num=num.strip() 
    if num.isupper():
        num=num.lower()
    if len(num.split())>=2:
        if len(num.split(" ")[0])<=2:
            num=num.split(" ")[1]
        else:
            num=num.split(" ")[0]
    if len(num)>=8:
        b=num
        num=wordninja.split(num)[0]
        if len(num)<=2:
            num1=wordninja.split(num)
            for name in num1:
                num=max(num,name)
             
        
    names.append(num)   
df["clean_names"]=names
df.shape[0]


# In[169]:


df
Example #15
    def convert_mm_examples_to_features(self, examples, label_list, tokenizer):
        label_map = {label: i for i, label in enumerate(label_list)}
        features = []

        transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        for (ex_index, example) in enumerate(examples):

            hashtags = []
            tokens = []

            sent = example.text.split()
            i = 0
            while i < len(sent):
                if sent[i] == "#" and i < len(sent) - 1:
                    while sent[i] == "#" and i < len(sent) - 1:
                        i += 1
                    if sent[i] != "#":
                        temp = wordninja.split(sent[i])
                        for _ in temp:
                            hashtags.append(_)
                else:
                    if sent[i] != "#":
                        temp = wordninja.split(sent[i])
                        for _ in temp:
                            tokens.append(_)
                    i += 1
            tokens = " ".join(tokens)
            hashtags = " ".join(hashtags) if len(hashtags) != 0 else "None"
            tokens = tokenizer.tokenize(tokens)
            hashtags = tokenizer.tokenize(hashtags)

            #####
            # image_text = None
            # image_text_dic = get_image_text()

            # if example.img_id in image_text_dic:
            #     image_text = list(image_text_dic[example.img_id])
            # else:
            #     image_text = ["None"]
            #####

            if len(tokens) > self.max_seq_length - 2:
                tokens = tokens[:(self.max_seq_length - 2)]
            if len(hashtags) > self.max_hashtag_length - 2:
                hashtags = hashtags[:(self.max_hashtag_length - 2)]

            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)
            added_input_mask = [1] * (len(input_ids) + 49)
            padding = [0] * (self.max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding
            added_input_mask += padding

            hashtags = ["[CLS]"] + hashtags + ["[SEP]"]
            hashtag_input_ids = tokenizer.convert_tokens_to_ids(hashtags)
            hashtag_input_mask = [1] * len(hashtag_input_ids)
            hashtag_padding = [0] * (self.max_hashtag_length -
                                     len(hashtag_input_ids))
            hashtag_input_ids += hashtag_padding
            hashtag_input_mask += hashtag_padding
            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length

            assert len(hashtag_input_ids) == self.max_hashtag_length
            assert len(hashtag_input_mask) == self.max_hashtag_length
            label_id = label_map[example.label]

            # process images
            image_name = example.img_id
            image_path = os.path.join(self.image_path, image_name + ".jpg")
            image = self.image_process(image_path, transform)  # 3*224*224

            features.append(
                MMInputFeatures(input_ids=input_ids,
                                input_mask=input_mask,
                                added_input_mask=added_input_mask,
                                img_feat=image,
                                hashtag_input_ids=hashtag_input_ids,
                                hashtag_input_mask=hashtag_input_mask,
                                label_id=label_id))
            if ex_index % 1000 == 0:
                logger.info("processed image num: " + str(ex_index) +
                            " **********")
        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_added_input_mask = torch.tensor(
            [f.added_input_mask for f in features], dtype=torch.long)
        all_img_feats = torch.stack([f.img_feat for f in features])
        all_hashtag_input_ids = torch.tensor(
            [f.hashtag_input_ids for f in features], dtype=torch.long)
        all_hashtag_input_mask = torch.tensor(
            [f.hashtag_input_mask for f in features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
        return all_input_ids, all_input_mask, all_added_input_mask, all_img_feats, all_hashtag_input_ids, all_hashtag_input_mask, all_label_ids
Example #16
import csv
import json
import re

import wordninja

f1 = open('D:\REST API\\result\\path-all6366531348915523614.csv','r',newline='')
reader1 = csv.reader(f1)
splitresult={}
for row in reader1:
    pathresults={}
    for i in range(1,len(row)):
        resulttemp=[]
        hirSplitList=row[i].split('/')
        for hir in hirSplitList:
            #print(hir)
            if hir!='' :
                a = re.sub(u"\\{.*?}", "", hir)
                resultfir = wordninja.split(a)
                resulttemp.append(resultfir)

        pathresults[row[i]]=resulttemp
    splitresult[row[0]]=pathresults

print(splitresult)
json_str = json.dumps(splitresult, indent=4)
with open('D:\REST API\\result\\pathsplithir.json', 'w') as json_file:
    json_file.write(json_str)
# print(wordninja.split('authorized Certificates'))
# print(wordninja.split('authorizedCertificates'))
# print(wordninja.split('authorized-Certificates'))
# print(wordninja.split('authorizedcertificates'))
# print(wordninja.split('authorized/certificates'))
Example #17
def recognize(image_path,
              weights_path,
              char_dict_path,
              ord_map_dict_path,
              is_vis,
              is_english=True):
    """

    :param image_path:
    :param weights_path:
    :param char_dict_path:
    :param ord_map_dict_path:
    :param is_vis:
    :return:
    """
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    new_heigth = 32
    scale_rate = new_heigth / image.shape[0]
    new_width = int(scale_rate * image.shape[1])
    new_width = new_width if new_width > CFG.ARCH.INPUT_SIZE[
        0] else CFG.ARCH.INPUT_SIZE[0]
    image = cv2.resize(image, (new_width, new_heigth),
                       interpolation=cv2.INTER_LINEAR)
    image_vis = image
    image = np.array(image, np.float32) / 127.5 - 1.0

    inputdata = tf.placeholder(
        dtype=tf.float32,
        shape=[1, new_heigth, new_width, CFG.ARCH.INPUT_CHANNELS],
        name='input')

    codec = tf_io_pipline_fast_tools.CrnnFeatureReader(
        char_dict_path=char_dict_path, ord_map_dict_path=ord_map_dict_path)

    net = crnn_net.ShadowNet(phase='test',
                             hidden_nums=CFG.ARCH.HIDDEN_UNITS,
                             layers_nums=CFG.ARCH.HIDDEN_LAYERS,
                             num_classes=CFG.ARCH.NUM_CLASSES)

    inference_ret = net.inference(inputdata=inputdata,
                                  name='shadow_net',
                                  reuse=False)

    decodes, _ = tf.nn.ctc_beam_search_decoder(
        inputs=inference_ret,
        sequence_length=int(new_width / 4) * np.ones(1),
        merge_repeated=False,
        beam_width=10)

    # config tf saver
    saver = tf.train.Saver()

    # config tf session
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TEST.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.TEST.TF_ALLOW_GROWTH

    sess = tf.Session(config=sess_config)

    with sess.as_default():

        saver.restore(sess=sess, save_path=weights_path)

        preds = sess.run(decodes, feed_dict={inputdata: [image]})

        preds = codec.sparse_tensor_to_str(preds[0])[0]
        if is_english:
            preds = ' '.join(wordninja.split(preds))

        logger.info('Predict image {:s} result: {:s}'.format(
            ops.split(image_path)[1], preds))

        if is_vis:
            plt.figure('CRNN Model Demo')
            plt.imshow(image_vis[:, :, (2, 1, 0)])
            plt.show()

    sess.close()

    return
Example #18
 def preprocessing(self, onAllData=False, saveData=True):
     '''Function to perform the data cleaning: spell checking,
     splitting combined words and creating new attributes.

     Arguments:
     onAllData = whether to process the entire dataframe or
                 only specific categories, based on the threshold value

     saveData = whether to store the filtered data on disk.

     '''
     """ Remove the categories that have too few observations """
     if onAllData:
         data = self.data
     
     else:
         data = self.filterCategories(self.threshValue)  
     
     tqdm.pandas()
     ''' Apply the spellcheck transformation to each observation '''
     print('Applying the spellcheck operation to each word.')
     st = time.time()
     data['correctedText'] = data['comments_text'].progress_apply(lambda x : ' '.join([self.spellCheck(item) for item in x.lower().replace(',', ' ').split()]))
     print(f'Spellcheck operation finished in {np.round((time.time()-st)/60.0, 3)} mins.')
     
     
     ''' Splitting the combined words '''
     print('Starting the combined-word separation operation on each record.')
     st = time.time()
     data['correctedText'] = data['correctedText'].progress_apply(lambda x : ' '.join([' '.join(wordninja.split(item)) for item in x.replace(',', ' ').split(' ')]))
     print(f'Operation finished in {np.round((time.time()-st)/60.0, 3)} mins.')
     
     if saveData:
         path = '/'.join(self.dataFilePath.split('/')[:-1]) +'/'
         try:
             data.to_excel(path + 'FilteredData.xlsx')
         except Exception as e:
             print(f'{e}\n error in storing the file, hence using pickle to store it instead')
             data.to_pickle(path + 'FilterData.pkl')
         
     return data
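A minimal sketch of the combined-word splitting step in isolation, applied to a toy pandas Series (illustrative only; the real method runs it through tqdm's progress_apply):

import pandas as pd
import wordninja

comments = pd.Series(["thisisacombinedword", "already split text"])
# Same transformation as the progress_apply above, without the progress bar.
split_comments = comments.apply(
    lambda x: ' '.join(' '.join(wordninja.split(item)) for item in x.replace(',', ' ').split(' '))
)
print(split_comments.tolist())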
Example #19
    else: 
      Not_printed+=1;

    if(Not_printed>1 and Not_printed%5==0):
      cv2.putText(img,"NT", (10,50), font, 0.8, (0, 255, 0), 2, cv2.LINE_AA)
      cv2.imshow('frame',img)
      print("not displaying"+str(cnt))
    # Display the resulting frame
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    if label1=="o" and cnt>50 and len(que)>2:
        break;

# When everything done, release the capture
cap.release()
cv2.destroyAllWindows()
while len(que) > 0 and que[-1] == "o":
    que.pop()
while len(que) > 0 and que[0] == "o":
    que.pop(0)

s = "".join(que)
s1 = ninja.split(s)
print(s)

print(*s1, sep=" ")

Example #20
def word_compounds(word: str, min_length=2):
    return [
        compound for compound in wordninja.split(word)
        if len(compound) >= min_length
    ]
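A quick usage sketch of word_compounds (the exact splits depend on wordninja's frequency model, so the commented results are illustrative):

print(word_compounds("win32intel"))               # e.g. ['win', '32', 'intel']
print(word_compounds("somewords", min_length=3))  # e.g. ['some', 'words']; shorter pieces are dropped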
def Clean_Tweets(input_text, sentiment_type):

    #remove b from string
    input_text = str(input_text)
    input_text = input_text[2:len(input_text) - 1]
    input_text = " ".join((input_text).split())

    if sentiment_type == "ANEW":
        input_text = ' '.join([input_text]).lower()

    #remove @s
    replies = re.findall("@[\w]*", input_text)
    for i in replies:
        input_text = re.sub(i, '', input_text)

    #remove URLs
    urls = re.findall("https?://[A-Za-z0-9./]+", input_text)
    for i in urls:
        input_text = re.sub(i, '', input_text)

    #remove newlines
    a = []
    if len(re.findall(r"\\n", input_text)) == 0:
        pass
    if len(re.findall(r"\\n", input_text)) > 0:
        for match in finditer(r'\\n', input_text):
            a.append(match.start())
            b = (np.asarray(a))
            b1 = (np.asarray(a)) + 1
            alpha = np.zeros(shape=(len(a), 2))
            alpha[:, 0] = b
            alpha[:, 1] = b1
            alpha = (np.concatenate(alpha))
            alpha = [int(x) for x in alpha]
            newline_list = list(input_text)

        for i in range(len(alpha)):
            newline_list[alpha[i]] = " "
        newline_list = ''.join(newline_list)
        newline_list = ' '.join(newline_list.split())
        input_text = newline_list.strip()

    #convert unicode to ascii
    input_text = (
        input_text.replace('\\xe2\\x80\\x99', "'").replace(
            '\\xc3\\xa9', 'e').replace('\\xe2\\x80\\x90', '-').replace(
                '\\xe2\\x80\\x91',
                '-').replace('\\xe2\\x80\\x92', '-').replace(
                    '\\xe2\\x80\\x93',
                    '-').replace('\\xe2\\x80\\x94', '-').replace(
                        '\\xe2\\x80\\x94',
                        '-').replace('\\xe2\\x80\\x98', "'").replace(
                            '\\xe2\\x80\\x9b',
                            "'").replace('\\xe2\\x80\\x9c', '"').replace(
                                '\\xe2\\x80\\x9c',
                                '"').replace('\\xe2\\x80\\x9d', '"').replace(
                                    '\\xe2\\x80\\x9e', '"').replace(
                                        '\\xe2\\x80\\x9f',
                                        '"').replace('\\xe2\\x80\\xa6',
                                                     '...').  #
        replace('\\xe2\\x80\\xb2', "'").replace(
            '\\xe2\\x80\\xb3', "'").replace('\\xe2\\x80\\xb4', "'").replace(
                '\\xe2\\x80\\xb5',
                "'").replace('\\xe2\\x80\\xb6',
                             "'").replace('\\xe2\\x80\\xb7', "'").replace(
                                 '\\xe2\\x81\\xba',
                                 "+").replace('\\xe2\\x81\\xbb', "-").replace(
                                     '\\xe2\\x81\\xbc', "=").replace(
                                         '\\xe2\\x81\\xbd',
                                         "(").replace('\\xe2\\x81\\xbe', ")"))

    #remove emojis
    a = []
    if len(re.findall(r"\\x", input_text)) == 0:
        input_text = input_text
    if len(re.findall(r"\\x", input_text)) > 0:
        for match in finditer(r'\\x', input_text):
            a.append(match.start())
        b = (np.asarray(a))
        b1 = (np.asarray(a)) + 1
        b2 = (np.asarray(a)) + 2
        b3 = (np.asarray(a)) + 3

        alpha = np.zeros(shape=(len(a), 4))
        alpha[:, 0] = b
        alpha[:, 1] = b1
        alpha[:, 2] = b2
        alpha[:, 3] = b3

        alpha = (np.concatenate(alpha))

        l = [i for i in range(len(input_text))]
        beta = (np.in1d(l, alpha))
        gamma = list(input_text)
        gamma_indx = []
        for i in range(len(beta) - 1):
            if (((beta[i] == True) & (beta[i + 1] == False)) |
                ((beta[i - 1] == False) & (beta[i] == True))):
                gamma_indx.append(i)
                gamma[i] = " "
        gamma = (''.join(gamma))

        res = [i for i, val in enumerate(beta) if not val]
        new = gamma_indx + res
        new.sort()
        string1 = []
        for i in new:
            string1.append(gamma[i])
        string1 = ''.join(string1)
        string1 = ' '.join(string1.split())
        input_text = string1.strip()

    #fix contractions
    input_text = contractions.fix(input_text)

    #remove punctuation, but keep periods, question marks, and exclamation points
    #replace punctuation characters with a space - ensure words remain separated
    punctuations = '''()-=+[]{};:"\,<>/|¦`$%^&*_~'''
    no_punct = ""
    for char in input_text:
        if char in punctuations:
            char = ' '
        if char not in punctuations:
            no_punct = no_punct + char
    input_text = " ".join(no_punct.split())

    #hashtags
    input_text = tokenizer(input_text)
    a = []
    for token in input_text:
        a.append(token.text)
        ' '.join(a)
    for i in range(len(a)):
        if ("#" in a[i]):
            t = word_tokenize(a[i])
            if (len(t) > 1):
                t = ' '.join(wordninja.split(t[1]))
                a[i] = t
            else:
                a[i] = ""
    a = ' '.join(a)
    input_text = " ".join(a.split())

    #remove numbers
    input_text = re.sub(r'\d+', '', input_text)

    #remove RT marker for retweets and any leading whitespaces
    input_text = (input_text.replace('RT', '')).lstrip()

    #replace amp with and
    input_text = (input_text.replace('amp', 'and')).lstrip()

    return (input_text)
Example #22
    #print("document['tokens'] = " , document['tokens'] )
    #print("len(document['tokens']) = " , len(document['tokens']) )
    #if i_num_tokens_max < len(document['tokens']):
    #    i_num_tokens_max = len(document['tokens'])
    #if i_num_tokens_min > len(document['tokens']):
    #    i_num_tokens_min = len(document['tokens'])

    #doc_complete.append( test_input_doc_text.strip('\n') )

    #not_in_vocab_set = set()
    doc_list = []
    doc_split_word_list = []
    for s_word in document['tokens']:

        split_word_list_tmp = []
        split_word_list_tmp = wordninja.split(s_word)
        #print("len(split_word_list_tmp) = " , len(split_word_list_tmp) )
        if len(split_word_list_tmp) == 0:
            print("ZERO: ", s_word)
            sys.exit(0)
        for i_word in split_word_list_tmp:
            i_accum_vocab += 1
            split_word_set.add(i_word)
            doc_split_word_list.append(i_word)

        entire_vocab_set.add(s_word)

        #print(s_word in english_words)
        if not (s_word in english_words):
            not_in_vocab_set.add(s_word)
        else:
Example #23
count = 0
file_out = open("output.txt", 'w')
fname = "1996_sigrams.tsv"
fh = open(fname)
# separate_word = list()
words = list()
for line in fh:
    # separate_word[1] = separate_word[1].replace('\n', '')
    # print(separate_word[1])
    # file_out.write(separate_word[1])
    # file_out.write('\n')
    slovo = line.split('\t')
    slovo[1] = slovo[1].replace('\n', '')
    if int(slovo[1]) < 2:
        words = wordninja.split(slovo[0])
        count = count + 1
        for word in words:
            file_out.write(word)
            file_out.write(' ')
            file_out.write('\n')
    else:
        continue

count = count + 1
print(count)

file_out = open("output.txt", 'r')
for word, frequency in fdist_tgs.most_common(50):
    print(u'{};{}'.format(word, frequency))
Example #24
import wordninja as wj
w = wj.split(open("input.txt", "r").read())
# shall we use more?
print(w)
Example #25
File: test.py Project: yf1291/nlp3
 def test_caps(self):
     self.assertEqual(list(wordninja.split('DEREKANDERSON')),
                      ['DEREK', 'ANDERSON'])
Example #26
dict = {}
count = 0
minimum = 0
ones = 0
twos = 0
more = 0
with open('data/expiringnames.csv', newline='') as csvfile:
    print(csvfile)
    spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
    for row in spamreader:
        cells = row[0].split(',')
        name = cells[0]
        date = cells[1]
        price = float(cells[2])
        words = wordninja.split(name)
        if price == 0.01:
            minimum = minimum + 1
        else:
            count = count + 1

        if len(words) == 1:
            ones = ones + 1
        if len(words) == 2:
            twos = twos + 1
        if len(words) > 2:
            more = more + 1

        for word in words:
            if word in dict:
                dict[word]['price'] = dict[word]['price'] + price
Example #27
File: test.py Project: yf1291/nlp3
 def test_apostrophes(self):
     self.assertEqual(list(wordninja.split("that'sthesheriff'sbadge")),
                      ["that's", "the", "sheriff's", "badge"])
Example #28
def clean(sen,
          remove_stopwords=True,
          contraction=True,
          pun=True,
          lemma_=False):
    #     re.sub(pattern, repl, string, count=0, flags=0)
    # pattern: the regular expression pattern string
    # repl: the replacement (either a string or a function)
    # string: the string to be processed, i.e. the one the substitutions are applied to
    # count: maximum number of substitutions; the default replaces all matches
    # flags: regex flags (e.g. re.IGNORECASE)

    sen = re.sub(r'\{\{(.*?)\}\}', "", sen)
    #catch the left over links that have no closing braces
    sen = re.sub(r'\{\{(.*)', "", sen)
    #remove the quotes that are left over, the filter
    sen = re.sub(r'\'+', "", sen)
    #remove the filenames of images but retain the title text they are called from
    sen = re.sub(r'(.*)\|', "", sen)
    sen = sen.strip(""" '!:?-_().,'"[]{};*""")

    sen = ' '.join(
        [w.strip(""" '!:?-_().,'"[]{};*""") for w in re.split(' ', sen)])

    sen = re.sub("[-+]?[.\d]*[\d]+[:,.\d]*", " NUMBER ", sen)

    # spliting words
    string = []
    for x in sen.split():
        if len(x) > 6:
            for i in wordninja.split(x):  # split the long token into words
                if len(i) > 2:
                    string.append(i)
        else:
            string.append(x)
    sen = " ".join(string)

    # Optionally expand contractions (the lookup table is commented out below,
    # so this pass currently rebuilds the sentence unchanged)
    if contraction:
        new_text = []
        for word in sen.split():  # split on whitespace by default
            # if word in contractions:
            #     new_text.append(contractions[word])
            # else:
            new_text.append(word)
        sen = " ".join(new_text)

    sen = re.sub(r"[^A-Za-z0-9:(),\'\`]", " ", sen)
    sen = re.sub(r"\b\d+\b", "", sen)  #remove numbers
    sen = re.sub('\s+', ' ', sen)  # collapse runs of whitespace characters
    sen = re.sub(r'(?:^| )\w(?:$| )', ' ',
                 sen).strip()  # remove single characters

    # Optionally, remove stop words
    if remove_stopwords:
        sen = " ".join([i for i in sen.split() if i not in stop])

    # Optionally remove punctuation
    if pun:
        sen = ''.join(ch for ch in sen if ch not in exclude)

    # Optionally lemmatize
    if lemma_:
        sen = " ".join(WordNetLemmatizer().lemmatize(word)
                       for word in sen.split())

    return sen.strip().lower()  # convert to lowercase
Example #29
def mirkoPreprocessing(row,args,text_processor):
  SMILEY = load_dict_emoticon()  
  tweet = row.text
  if(type(tweet) == float) :
  	return tweet
  tweet = html.unescape(tweet)

  if args.normalizeNoise:
    tweet = str(" ".join(text_processor.pre_process_doc(tweet)))

  if args.doEmoticon == True:
    words = tweet.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)

  if args.removeEmoticon == True:
    words = tweet.split()
    reformed = [" " if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)

  if args.normalizeEmoticon == True:
    words = tweet.split()
    reformed = ["<emoticon>" if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)
    
  if args.doEmoji == True:
    number = emoji.emoji_count(tweet)
    if number != 0:
    	tweet = emoji.demojize(tweet,delimiters=("", ""))
    	tweet = tweet.replace("_"," ")
  if args.removeEmoji == True:
    emoji_pattern = re.compile("["u"\U0001F600-\U0001F64F"  #emoticons 
                               u"\U0001F300-\U0001F5FF"     #symbols & pictographs
                               u"\U0001F680-\U0001F6FF"     #transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"     #flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    tweet = emoji_pattern.sub(r'', tweet)
  if args.normalizeEmoji == True:
    emoji_pattern = re.compile("["u"\U0001F600-\U0001F64F" #emoticons 
                               u"\U0001F300-\U0001F5FF"    #symbols & pictographs
                               u"\U0001F680-\U0001F6FF"    #transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"    #flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    tweet = emoji_pattern.sub(r' <emoji> ', tweet)
  if args.removeMention == True:
      tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)", " ", tweet).split())
  if args.normalizeMention == True:
      tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)", " @user ", tweet).split())
  if args.removeUrl == True:
  	tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())
  if args.normalizeUrl == True:
      tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " url ", tweet).split())
  if args.rawHashtag == False:
    elems = [tag.strip("#") for tag in tweet.split() if tag.startswith("#")]
    for elem in elems:
        traslate = ' '.join(wordninja.split(elem))
        if args.unpackHastags == True:
            tweet = tweet.replace("#"+elem,traslate)
        if args.tagAndUnpackHastags == True:
            tweet = tweet.replace("#"+elem,"<"+ traslate +">")
        if args.removeHastags == True:
            tweet = tweet.replace("#"+elem," ")
        if args.normalizeHastags == True:
            tweet = tweet.replace("#"+elem,"#hashtag")
  if args.removePunctuation:
    tweet = ' '.join(re.sub("[\.\,\!\?\:\;\-\=]", " ", tweet).split())
  tweet = re.sub(r"\s+", ' ', tweet)   
  if args.doLower:
  	return tweet.lower()
  else:
    return tweet
Example #30
File: test.py Project: macunha1/wordninja
 def test_with_a_custom_regex(self):
     param = 'toseparate.ornottoseparate.com'
     custom_regex = re.compile("[^a-zA-Z0-9\.]+")
     self.assertNotEqual(wordninja.split(param, re=custom_regex),
                         wordninja.split(param))