def tokenize_tweets(texts, segment=True, segment_vocab=None):
    tknzr = TweetTokenizer()
    token_x = [tknzr.tokenize(t) for t in texts]
    if not segment:
        return token_x
    # segmentation is needed; default to an empty vocab so the membership
    # test below does not fail on the None default
    if segment_vocab is None:
        segment_vocab = set()
    wordsegment.load()
    tokens = []
    for line in token_x:
        tokens += line
    counter = Counter(tokens)
    # identify segment-able words
    segmented = {}
    for word in counter:
        if word not in segment_vocab:
            segs = wordsegment.segment(word)
            if len(segs) > 1:
                segmented[word] = segs
    # reconstruct the list
    _token_x = []
    for line in token_x:
        _line = []
        for token in line:
            if token in segmented:
                _line += segmented[token]
            else:
                _line.append(token)
        _token_x.append(_line)
    return _token_x

def __init__(self):
    if not os.path.exists("index"):
        os.mkdir("index")
    else:
        shutil.rmtree("index")
        os.mkdir("index")
    load()

def expand_dict():
    # init
    all_words = set(active.keys())
    all_words.add("leetcode")
    all_words.add("yuzhouwan")
    # split
    load()
    for word in active.keys():
        for seg in segment(word):
            all_words.add(seg)
        match = re.match(r"([a-z]+)([0-9]+)", word, re.I)
        if match:
            items = match.groups()
            for item in items:
                all_words.add(item)
    all_words = sorted(all_words)
    # build
    component = Et.Element("component")
    component.set("name", "ProjectDictionaryState")
    dictionary = Et.SubElement(component, "dictionary")
    dictionary.set("name", "yuzhouwan")
    words = Et.SubElement(dictionary, "words")
    for word in all_words:
        if len(word) < 2:
            continue
        Et.SubElement(words, "w").text = word
    data = Et.tostring(component).decode("utf-8")
    # write
    with open(".idea/dictionaries/yuzhouwan.xml", "w") as dict_xml:
        dict_xml.write(xml.dom.minidom.parseString(data).toprettyxml())

def build_preprocess(demojize, textify_emoji, mention_limit, punc_limit,
                     lower_hashtag, segment_hashtag, add_cap_sign):
    if textify_emoji and not demojize:
        raise Exception("textify_emoji is meaningless without demojize")
    funcs = [
        html.unescape,
        normalize_quotes,
        partial(tag, regex=URL_REGEX, tag=URL_TAG),
        partial(tag, regex=USER_REGEX, tag=USER_TAG),
        partial(tag, regex=NUMBER_REGEX, tag=NUMBER_TAG),
    ]
    if demojize:
        funcs.append(replace_emojis)
    if textify_emoji:
        funcs.append(textify_emojis)
    if mention_limit > 0:
        funcs.append(partial(limit_mentions, keep_num=mention_limit))
    if punc_limit > 0:
        funcs.append(partial(limit_punctuations, keep_num=punc_limit))
    if lower_hashtag:
        funcs.append(lower_hashtags)
    if segment_hashtag:
        load()
        funcs.append(segment_hashtags)
    if add_cap_sign:
        funcs.append(add_capital_signs)
    return compose(*funcs)

def calculate_wordsegment_accuracy(verbose=False):
    def is_not_special_character(part):
        return part not in ['.', ':', '_', '~']

    # load the wordsegment dictionary
    load()
    df = pd.read_csv("tmp/cheat_splitting_file.csv", header=None)
    identifiers = list(itertools.chain.from_iterable(df.values[34355:, 0:1]))
    splitted_identifiers = list(
        itertools.chain.from_iterable(df.values[34355:, 1:2]))
    lendata = len(identifiers)
    count = 0
    for i in range(lendata):
        wrong_split = True
        splitted_identifier = splitted_identifiers[i].lower()
        parts = splitted_identifier.split('-')
        parts = [x for x in parts if is_not_special_character(x)]
        wordsegment_results = segment(identifiers[i])
        if len(parts) == len(wordsegment_results):
            difference = list(set(parts).difference(set(wordsegment_results)))
            if len(difference) == 0:
                count += 1
                wrong_split = False
        if verbose and wrong_split:
            print(parts)
            print(wordsegment_results)
    print(count / lendata)

def createWordEmbeddings(inFileName, num_epochs=1):
    sentences = []
    wordsegment.load()
    num_epochs = int(num_epochs)
    with open(inFileName, 'r') as csvfile:
        tweetreader = csv.reader(csvfile, delimiter='\t')
        for tweet in tweetreader:
            try:
                temp_segs = tweet[1].lower().strip().split()
                # drop URLs and mentions (filter instead of popping while
                # iterating, which could skip items or raise IndexError)
                temp_segs = [seg for seg in temp_segs
                             if 'http' not in seg and '@' not in seg]
                # optional stricter filter, disabled in the original: keep only
                # tokens found in wordnet.synsets() or words.words()
                temp_segs = wordsegment.segment(' '.join(temp_segs))
                sentences.append(temp_segs)
            except Exception as e:
                print(e)
                print(tweet)
                continue
    model = Word2Vec(sentences, min_count=1, iter=num_epochs)
    model.save('nonOffensiveModel-NoConstraint_' + str(num_epochs) + 'epoch.bin')

def search_results(request):
    names = request.POST.get('search_q')
    if check_url(names):
        longitude = request.POST.get('longitude')
        latitude = request.POST.get('latitude')
        location_names = google_lookup(longitude, latitude)
        locations = list(map(strip_out, location_names))
        load()
        wlist = segment(names.split('.')[0])
        synlist = dict_lookup(wlist)
        retlist = combine_all(locations, synlist, tlds)
        templist = list(map(strip_tld, retlist))
        returnlist = []
        temp = names.split('.')[0]
        for entries in retlist:
            if SequenceMatcher(None, temp, entries).ratio() >= 0.5:
                returnlist.append(entries)
        returnlist = list(set(returnlist))
        mylist = sorted(returnlist, key=lambda x: temp, reverse=False)
        mylist = list(map(strip_space, mylist))
        finalval = check_data(mylist)
        newlist = verisign_mass_lookup(finalval)
        print(newlist)
        return JsonResponse({"retlist": newlist}, safe=False)
    else:
        return JsonResponse("", safe=False)

def hashtags(tweets_path, out_filename):
    """
    Segment the expressions that follow hashtags.

    :param tweets_path: path to the file that contains tweets.
    :param out_filename: path to the file that contains the preprocessed hashtag expressions.
    :return: path to the file that contains the preprocessed hashtag expressions.
    """
    print('\tHandling hashtags...')
    load()
    outfile = open(out_filename, "w+")
    for tweet in open(tweets_path, "r"):
        new_tweet = []
        list_of_words = tweet.split(' ')
        for i in range(len(list_of_words)):
            word = list_of_words[i]
            if word.startswith('#'):
                for w in segment(word[1:]):
                    new_tweet.append(w)
                if i == len(list_of_words) - 1:
                    new_tweet.append('\n')
            else:
                new_tweet.append(word)
        tweet_str = []
        for i in range(len(new_tweet)):
            tweet_str.append(str(new_tweet[i]))
            if i != len(new_tweet) - 1:
                tweet_str.append(' ')
        outfile.write(''.join(tweet_str))
    outfile.close()
    print('\t\tHashtags ok.')
    return out_filename

def get_segmented_text_column(self, comment_text):
    wordsegment.load()

    def segment_text(text):
        segmented_words = [wordsegment.segment(word) for word in text.split()]
        separated_words = [
            word
            for segments in segmented_words
            for word in segments
        ]
        segmented_text = ' '.join(separated_words)
        return segmented_text

    segmented_text_column = comment_text.apply(segment_text)
    return segmented_text_column

def detect_text(path):
    """Detects text in the file."""
    client = vision.ImageAnnotatorClient()
    with io.open(path, 'rb') as image_file:
        content = image_file.read()
    image = vision.types.Image(content=content)
    response = client.text_detection(image=image)
    texts = response.text_annotations
    print('Texts:')
    load()
    for text in texts:
        print('\n"{}"'.format(text.description))
        # vertices = (['({},{})'.format(vertex.x, vertex.y)
        #              for vertex in text.bounding_poly.vertices])
        # print('bounds: {}'.format(','.join(vertices)))
    try:
        segmented_words = " ".join(segment(texts[0].description))
    except:
        segmented_words = ""
    return segmented_words.upper()

def __init__(self, train_data='1_train_CensoredRedditData_ratio_15.0.tsv',
             trained_model='NULI.pt', params_file='NULI_params.json'):
    #x_train, y_train, x_test, labelNum, testTweets, labelsAsNums, numsAsLabels, max_seq_length = load_dataset(train_data)
    wordsegment.load()

    # load in params
    params_in = open(params_file)
    params_lines = params_in.readlines()
    params = json.loads(params_lines[0])
    self.labelNum = params['labelNum']
    self.labelsAsNums = params['labelsAsNums']
    self.numsAsLabels = params['numsAsLabels']
    self.max_seq_length = params['max_seq_length']

    # Load pre-trained tokenizer (vocabulary)
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")

    # load pre-trained model
    self.model = torch.load(trained_model)

def tokenize(string):
    """
    Split the input several times, returning intermediate results at each level:
      - delimited by underscores
      - letter/number boundaries
      - word segments

    E.g., tokenize('landuse_austin_tx_24dates') ->
        ['landuse', 'land', 'use', 'austin', 'tx', '24dates', '24', 'dates']

    (Don't need a token for the original string because to_tsvector splits
    on underscores.)
    """
    if not wordsegment.BIGRAMS:
        # Should only happen in dev.
        wordsegment.load()
    lvl1_parts = string.split('_')
    for lvl1 in lvl1_parts:
        lvl2_parts = ALPHA_NUM_RE.findall(lvl1)
        if len(lvl2_parts) > 1:
            yield lvl1
        for lvl2 in lvl2_parts:
            lvl3_parts = wordsegment.segment(lvl2)
            if len(lvl3_parts) > 1:
                yield lvl2
            yield from lvl3_parts

def tokenize(desc):
    wordsegment.load()
    desc = desc.lower()
    ## remove punctuation
    desc = nltk.tokenize.WordPunctTokenizer().tokenize(desc)
    exclude = set(string.punctuation)
    desc = [''.join([c for c in ch if c not in exclude]) for ch in desc]
    desc = [ch for ch in desc if ch]
    ## word segmenter
    desc = wordsegment.segment(' '.join(desc))
    ## remove stop words
    stopwords = set(nltk.corpus.stopwords.words('english'))
    desc = [ch for ch in desc if ch not in stopwords]
    ## remove integer values
    words = []
    for ch in desc:
        try:
            int(ch)
        except ValueError:
            words.append(ch)
    ## lemmatize
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

def concat_data():
    path = os.path.dirname(os.path.abspath(__file__)) + "/data/"
    with open(path + "crawled_data.pkl", "rb") as f:
        id2entities = pickle.load(f)

    ########## Lookup Tables ##########
    labels = list(set([entity[0] for entity in id2entities.values()]))
    num_classes = len(labels)
    label_lookup = np.zeros((num_classes, num_classes), int)
    np.fill_diagonal(label_lookup, 1)
    ###################################

    text_data, context_data, label_data = [], [], []
    label_dict = {}
    for i, label in enumerate(labels):
        label_dict[label] = i
    load()
    tknzr = TweetTokenizer(reduce_len=True, preserve_case=False,
                           strip_handles=False)
    print("Preprocessing tweets.....")
    for _id in id2entities:
        if id2entities[_id][0] in label_dict.keys():
            text_data.append(text_preprocess(id2entities[_id][1], tknzr))
            context_data.append(text_preprocess(id2entities[_id][2], tknzr))
            label_data.append(label_lookup[label_dict[id2entities[_id][0]]])
    assert len(text_data) == len(context_data) == len(label_data)
    return text_data, context_data, label_data

def domains_to_x_word(self, maxlen=None, n_words=50000):
    domains = self.domains
    if maxlen is None:
        maxlen = np.max([len(i) for i in domains])
    ws.load()
    for i in tqdm(range(len(domains))):
        domain_labels = domains[i].split(".")
        words = list()
        for j in range(len(domain_labels) - 1):
            segs = ws.segment(domain_labels[j])
            new_segs = list()
            for s in segs:
                if s in ws.UNIGRAMS:
                    new_segs.append(s)
                else:
                    new_segs += list(s)
            words += new_segs
        words.append(domain_labels[-1])
        domains[i] = words
    x = list()
    for domain in domains:
        x.append([
            text.one_hot(word, n_words, filters=" ")[0]
            for word in domain
        ])
    x = sequence.pad_sequences(x, padding='post', maxlen=maxlen)
    self.x = x
    self.n_words = n_words
    self.maxlen = maxlen

def main():
    # wordsegment's load function reads and parses the unigrams and bigrams
    # data from disk. Loading the data only needs to be done once.
    load()
    app = connexion.App(__name__, specification_dir='./swagger/')
    app.app.json_encoder = encoder.JSONEncoder
    app.add_api('swagger.yaml', arguments={'title': 'Did You Mean API'})
    app.run(port=8080)

def segment_hashtag(text: str) -> str:
    """Remove the hashtag character in front of a word and segment the rest."""
    text = text[1:]
    wordsegment.load()
    segments = wordsegment.segment(text)
    if len(segments) > 1:
        text = " ".join(segments)
    return text

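# Illustrative usage only (not part of the original source). The expected
# outputs below follow wordsegment's own documented example, where
# segment('thisisatest') yields ['this', 'is', 'a', 'test'].
print(segment_hashtag("#thisisatest"))  # expected: "this is a test"
print(segment_hashtag("#python"))       # single segment, returned unchanged: "python"
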
def stem_sentence(sentence):
    stemmer = nltk.stem.RSLPStemmer()
    load()
    stemmed = ""
    sentence = segment(sentence)
    for word in sentence:
        stemmed += stemmer.stem(word) + ' '
    return stemmed

def solve(data):
    res = []
    load()
    for dic in data:
        ans = {}
        i, s = dic['id'], dic['encryptedText']
        ans['id'] = i
        cip = CaesarCipher(s)
        ori = cip.cracked
        nn, cnt, l, ind = 0, 0, 0, 0
        for x in range(len(s) - 1):
            for y in range(x + 1, len(s)):
                tmp = ori[x:y + 1]
                cur = y - x + 1
                if tmp == tmp[::-1]:
                    if cur > l:
                        ind, l = x, cur
                    nn += 1
        has = []
        for c in ori[ind:ind + l]:
            has.append(ord(c))
        ans['encryptionCount'] = 0
        tar = ord(s[0])
        cnt = ord(ori[0])
        if l == 0:
            for t in range(100):
                if cnt == tar:
                    ans['encryptionCount'] = t
                    break
                cnt += cnt
                if cnt > 122:
                    cnt = (cnt - 123) % 26 + 97
        else:
            for t in range(100):
                if cnt == tar:
                    ans['encryptionCount'] = t
                    break
                tmp = sum(has) + nn
                for i in range(len(has)):
                    has[i] += tmp
                    if has[i] > 122:
                        has[i] = (has[i] - 123) % 26 + 97
                cnt += tmp
                if cnt > 122:
                    cnt = (cnt - 123) % 26 + 97
        tmp = wordninja.split(ori)
        s = tmp[0]
        for i in range(1, len(tmp)):
            if (tmp[i] == 'al'
                    or (tmp[i] == 'in' and tmp[i - 1] == 'n')
                    or (tmp[i] == 'ty' and tmp[i - 1] == 'in')
                    or tmp[i] == 's'
                    or (tmp[i] == 'i' and tmp[i - 1] == 'a')):
                s += tmp[i]
            else:
                s += ' ' + tmp[i]
        ans['originalText'] = s
        res.append(ans)
    return res

def segment_words(self, df):
    load()
    # segment combined words, such as in the case of hashtags,
    # e.g. #word1word2 to #word1 word2
    df['text_preprocessed'] = df['text_preprocessed'].apply(
        lambda x: ' '.join(segment(x)))
    return df

def spell_correct(word):
    spell = SpellChecker()
    load()
    correct_spelling = ''
    words = word.split(' ')
    for wd in words:
        correct_spelling = correct_spelling + spell.correction(wd) + ' '
    correct_spelling = ' '.join(segment(correct_spelling))
    return correct_spelling

def fc2(inputHashTags):
    # count the word segments in the given input hashtag
    if len(inputHashTags) == 0:
        return constant.CONSTANT_INPUT_VALIDATION_ERROR
    elif (inputHashTags[0] == constant.CONSTANT_KEYWORD_HASHTAG
          and len(inputHashTags) == 1):
        return constant.CONSTANT_INPUT_VALIDATION_ERROR
    else:
        load()
        x = segment(inputHashTags[1:])
        return len(x)

def __init__(self, ngrams=None):
    ws.load()
    # add unigrams to wordsegment defaults
    if ngrams and 'unigrams' in ngrams:
        for ngram, count in ngrams['unigrams'].items():
            ws.UNIGRAMS[ngram] = count
    # add bigrams to wordsegment defaults
    if ngrams and 'bigrams' in ngrams:
        for ngram, count in ngrams['bigrams'].items():
            ws.BIGRAMS[ngram] = count

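# Illustrative sketch only (the enclosing class is not shown above). Injecting
# a custom unigram count, as the constructor does, biases wordsegment toward
# keeping that token whole rather than splitting it into smaller known words.
# The term and count below are hypothetical.
import wordsegment as ws

ws.load()
ws.UNIGRAMS["openstreetmap"] = 1e7
print(ws.segment("openstreetmapdata"))  # now more likely ['openstreetmap', 'data']
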
def __init__(self, include_tld=True, option=DomainMatchingOption.ORDER_MATCH):
    '''
    Just load the wordsegment package, whatever it is.
    '''
    wordsegment.load()
    # Save the matching option here so we can refer to it later
    self.include_tld = include_tld
    self.option = {
        DomainMatchingOption.SUBSET_MATCH: set,
        DomainMatchingOption.ORDER_MATCH: list,
    }[option]

def collect_tags_and_decomposition(path):
    tags = pickle.load(open(path, "rb"))
    load()
    for key, value in tags.items():
        cur_list = []
        for v in value[0]:
            cur = v.split('#')[1].lower()
            cur = segment(cur)
            cur = ' '.join(cur)
            if cur:
                cur_list.append(cur)
        tags[key] = cur_list
    return tags

def bigram(words):
    ws.load()
    values = [1.]
    for word1, word2 in zip(words[:-1], words[1:]):
        try:
            values += [np.log10(ws.BIGRAMS[' '.join([word1, word2])])]
        except KeyError:
            values += [1.]
    return values

def unigram(words):
    ws.load()
    values = []
    for word in words:
        try:
            values += [np.log10(ws.UNIGRAMS[word])]
        except KeyError:
            values += [1.]
    return values

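# Illustrative usage only: the unigram() and bigram() helpers above return
# log10 frequency values from wordsegment's bundled corpus, substituting 1.0
# whenever an n-gram is not found ('in the' is a known bigram, per the tests
# further below; 'zzzzq' is assumed to be unseen).
print(unigram(['choose', 'spain']))    # two log10 unigram counts
print(bigram(['in', 'the', 'zzzzq']))  # leading 1.0, log10 count of 'in the', then the 1.0 fallback
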
def _initialize():
    with Timer('initializing dictionary'):
        global initialized
        # takes 500ms, 100M memory
        wordsegment.load()
        # takes 900ms, 15M memory
        c.executemany('INSERT INTO d VALUES (?,?,?)',
                      ((k[:3], k[3:], int(v))
                       for k, v in wordsegment.UNIGRAMS.items() if len(k) > 3))
        # takes 200ms, 10M memory
        c.execute('CREATE INDEX idx on d(p, f)')
        conn.commit()
        initialized = True

def get_default_graph():
    """ Convenience function to get default graph """
    cfg = json.load(open('./config.json'))
    ws.load()
    nlp = spacy.load('en_core_web_md')
    sr_df = pd.read_csv('./low_filtered_strict.csv')
    grapher = Grapher()
    grapher.create_graph_nodes_from_ilocs(sr_df, nlp)
    grapher.create_id_mapping()
    grapher.create_fc_graph(cfg)
    return grapher

def check_source_package(symbol_problem_dict):
    global last_source
    global count
    last_source = ''
    confirmed_row = []
    # read package-source mapping
    pkg_src_dict = {}
    mapping_file = open('pkg_src_map.txt', 'r')
    mapping_lines = mapping_file.readlines()
    mapping_file.close()
    for mapping_line in mapping_lines:
        mapping_line = mapping_line.strip()
        package, source = mapping_line.split(' ')
        pkg_src_dict[package] = source
    # check instances for the current source package
    conn = sqlite3.connect('depbug.db')
    cur = conn.cursor()
    cur.execute("select * from potential_depbug order by PkgName")
    rows = cur.fetchall()
    conn.close()
    load()
    tmp = 0
    for row in rows:
        count += 1
        row_data = [x.encode('ascii') for x in row[1:]]
        [PkgName, PkgVer, DepName, DepVer, LibName, LibObject,
         PreVer, PostVer, Direction, Severity, Symver] = row_data
        if Severity == 'SymRmv':
            insert_depbug('depbug_confirm', row_data)
            insert_depbug('depbug_detect', row_data)
            continue
        if PkgName not in pkg_src_dict:
            continue
        source = pkg_src_dict[PkgName]
        my_key = [LibName, LibObject, PreVer, PostVer, Direction, Symver]
        if str(my_key) not in symbol_problem_dict:
            continue
        symbol = Symver.split('@')[0]
        # skip private data type
        if symbol in private_type:
            continue
        # break if any problem is confirmed
        for problem in symbol_problem_dict[str(my_key)]:
            confirmed(row_data, source, symbol, problem)

from learning.pos import BackoffTagger, SpacyTagger, COCATagger
from learning.tagset_conversion import TagsetConverter
from learning.tree.wordnet import IndexedWordNetTree
from learning.model import TreeCutModel, Grammar, GrammarTagger
from pattern.en import pluralize, lexeme
from misc.util import Timer

# load global resources
log = logging.getLogger(__name__)
tag_converter = TagsetConverter()
proper_noun_tags = set(BackoffTagger.proper_noun_tags())
ws.load()


def new_wordnet_instance():
    """
    Create a new wordnet instance. This is useful for parallel workflows.
    Multiple processes cannot access the same wordnet instance (as when
    imported globally with `from nltk.corpus import wordnet`). This is due
    to nltk not being thread-safe.
    """
    return LazyCorpusLoader(
        'wordnet', WordNetCorpusReader,
        LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab',
                         encoding='utf8')
    )

import os
import sys

from .context import wordsegment
from wordsegment import (
    clean, load, main, isegment, segment, UNIGRAMS, BIGRAMS, WORDS,
)

load()


def test_unigrams():
    assert 'test' in UNIGRAMS


def test_bigrams():
    assert 'in the' in BIGRAMS


def test_clean():
    assert clean("Can't buy me love!") == 'cantbuymelove'


def test_segment_0():
    result = ['choose', 'spain']
    assert segment(''.join(result)) == result


def test_segment_1():
    result = ['this', 'is', 'a', 'test']
    assert segment(''.join(result)) == result


def test_segment_2():
    result = [
        'when', 'in', 'the', 'course', 'of', 'human', 'events', 'it',
        'becomes', 'necessary'
    ]