def parsing(self, line):
    line = line[:-1]  # strip the trailing '\n'
    # split the incoming line on tabs and collect the pieces in a tokens list
    tokenizer = RegexpTokenizer("\t", gaps=True)
    tokens = tokenizer.tokenize(line)
    count = 0
    for token in tokens:
        count += 1
        if count == 1:
            self.label = token
        elif count == 2:
            self.operator = token
        elif count == 3:
            # the operand field is itself comma-separated
            opnd_tokenizer = RegexpTokenizer(",", gaps=True)
            for op in opnd_tokenizer.tokenize(token):
                self.operand.append(op)
        elif count == 4:
            self.comment = token
        else:
            print("[TokenTable.py] parsing() error")
def create_sents(toks):
    wordre = r"\w+@[\w.]+|'s|[0-9]+[.0-9]+|[0-9]+|^\w+:|([A-Z][.]+)+|(\w+[-']?)+|[.!?]|:\w*\n"
    tokenizer = RegexpTokenizer(wordre)
    sentences = tokenizer.tokenize(toks)
    # re-tokenize each piece and return the resulting token lists
    return [tokenizer.tokenize(sentence) for sentence in sentences]
def tokenize(self, text):
    """
    Tokenize text into a list of Token objects.

    :param text: text to be tokenized (may contain several sentences)
    :type text: str
    :return: list of Token objects
    :rtype: list(Token)
    """
    tokens = []
    if self.tokenizer_type == "SpaceTokenizer":
        operator = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)
    elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer":
        operator = WhitespaceTokenizer()
        for counter, span in enumerate(operator.span_tokenize(text)):
            new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
            tokens.append(new_token)
    elif self.tokenizer_type == "PTBTokenizer":
        ptb_tokens = word_tokenize(text)
        for counter, (token, span) in enumerate(
                self._penn_treebank_tokens_with_spans(text, ptb_tokens)):
            new_token = Token(counter, token, span[0], span[1])
            tokens.append(new_token)
    return tokens
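# Illustration (not part of the original class) of the span_tokenize() mechanic
# used above: each span is a (start, end) offset pair into the source string,
# so text[start:end] recovers the token.
from nltk.tokenize import RegexpTokenizer

_demo_text = "Buy it for $12.40 now."
_demo = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
for i, (start, end) in enumerate(_demo.span_tokenize(_demo_text)):
    print(i, _demo_text[start:end], (start, end))
# 0 Buy (0, 3) / 1 it (4, 6) / 2 for (7, 10) / 3 $12.40 (11, 17) / ...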
def __init__(self, args, opts):
    super(Method, self).__init__(args, opts)
    self.es_int = ESInterface(host=self.opts.server,
                              port=self.opts.port,
                              index_name=self.opts.index_name)
    self.regex_citation = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"(\[(\d+([,–]\s?)?)+\])|"
        r"\[[\d,-]+\]").sub
    self.all_digits = re.compile(r"^\d+$").search
    if self.opts.remove_stopwords:
        with open(self.opts.stopwords_path) as f:
            self.stopwords = frozenset([l.strip().lower() for l in f])
    else:
        self.stopwords = frozenset([])
    self.doc_mod = documents_model.DocumentsModel(opts.docs_path)
    self.ann_client = AnnotationsClient()
    # matches APA citations such as (Chen et al., 2000) or Chen et al. (2000)
    self.reg_apa = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
        r"\w+\set al\. \(\d{2,4}\)")
    # rarer APA variants such as "Chen et al. 2000" without parentheses
    self.reg_apa_rare = re.compile(
        r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
    self.reg_apa2 = re.compile(
        r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
    # IEEE-style numeric citations like [3], [1, 2], [1–4]
    self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
    self.reg_paranthesis = re.compile(
        r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
    self.nlp_extractor = Extract_NLP_Tags()
    self.tokenizer = RegexpTokenizer(r'[^\w\-\']+', gaps=True)
    self.lmtzr = WordNetLemmatizer()
def parse(self, fname):
    """
    Parse the text of a file.

    :param fname: file name
    :return: (file name, 'nausea' keyword-density score, fraud flag)
    """
    density, fraud = 0, 0
    with codecs.open(fname, "r", encoding="utf-8") as f:
        text = f.read()
    # Cyrillic and Latin letters ('A-Za-z' rather than 'A-z', which would
    # also match the punctuation characters between 'Z' and 'a')
    tknz = RegexpTokenizer(pattern="[А-Яа-яA-Za-zё]+")
    txt_list = tknz.tokenize(text)
    if txt_list:
        for i, word in enumerate(txt_list):
            new_word = self.check_word(word)
            if new_word:
                txt_list[i] = new_word
                fraud += 1
        txt_list = [
            word.lower() for word in txt_list
            if not (word.lower() in self.sw)
        ]
        stemmer_ru = RussianStemmer()
        txt_list = [
            stemmer_ru.stem(token.lower()) for token in txt_list
            if len(token) > 1
        ]
        # the stopword and length filters can empty the list, so guard the division
        if txt_list:
            dict_w = Counter(txt_list)
            top5 = heapq.nlargest(5, dict_w, key=dict_w.get)
            top5_count = sum([dict_w[word] for word in top5])
            density = top5_count / len(txt_list)
    # The (fraud > 2) criterion was chosen based on testing against the
    # available sample: ads often contain strings like "WxDxH" that cannot be
    # recognized unambiguously, so this threshold is open to discussion and
    # may need adjusting.
    return fname, density, fraud > 2
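# Hedged usage sketch: assuming an instance `checker` of the surrounding class
# (which must provide check_word() and the stopword set self.sw), a caller
# might do:
#
#     fname, density, is_fraud = checker.parse("advert.txt")
#     # density is the share of the text taken up by the five most frequent
#     # stems; is_fraud is True when more than two suspicious words were fixed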
def get_emails_sent_by_person_list(emails_df):
    tokenizer = RegexpTokenizer(r'(?u)\b\w\w+\b')
    emails_df['subject_wc'] = emails_df['subject'].map(
        lambda x: len(tokenizer.tokenize(x)))
    emails_df['content_wc'] = emails_df['content'].map(
        lambda x: len(tokenizer.tokenize(x)))
    grouped_by_people = emails_df.groupby('from').agg({
        'content': 'count',
        'subject_wc': 'mean',
        'content_wc': 'mean',
    })
    grouped_by_people.rename(columns={
        'content': 'N emails',
        'subject_wc': 'Subject word count',
        'content_wc': 'Content word count'
    }, inplace=True)
    # sort_values returns a new frame, so keep the result
    grouped_by_people = grouped_by_people.sort_values(
        by=['N emails'], ascending=False)
    file_path = os.path.join(dir_path, 'results/emails_by_person.csv')
    grouped_by_people.to_csv(file_path)
def no_stop_tokens(self, text):
    tokens = []
    # match currency/decimal numbers or (possibly hyphenated) words
    tokenizer = RegexpTokenizer(r'(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
    tokens += tokenizer.tokenize(text)
    # stemmer = nltk.stem.snowball.EnglishStemmer()
    # tokens = map(lambda x: stemmer.stem(x), tokens)
    return tokens
def token_words(lyric):
    """
    in: lyric (an element of row['text'])

    Take a whole lyric and convert it into a list of words for analysis,
    applying a few cleaning steps to remove punctuation, stopwords and
    errors (minor focus on the last).

    return: list of words in the lyric
    """
    lyric = lyric.lower()
    # The tokenizer splits the lyric ('text') into words without punctuation.
    # It splits apostrophe words into two separate tokens, but that is fine:
    # such words are mostly non-main verbs (would, should, etc.), which are
    # usually insignificant in most contexts and get deleted anyway.
    # E.g. would've -> would ve, and the stopword list removes "ve".
    # TweetTokenizer produced very irregular tokens such as "(8" and bare
    # numbers, so it was not used.
    tokenizer1 = RegexpTokenizer("[a-z]+")
    words = tokenizer1.tokenize(lyric)
    # convert the stopword list to a set for faster membership tests
    en_stopwords = set(stopwords.words('english'))
    # add a few extra words for a cleaner result
    en_stopwords.add('chorus')
    # single letters aren't really words :)
    for c in ascii_lowercase:
        en_stopwords.add(c)
    words_lyric = [w for w in words if w not in en_stopwords]
    # post-processing of words_lyric
    words_lyric = postProcess(words_lyric)
    return words_lyric
def tokenize_sentence(text, preprocess=True):
    '''
    Tokenize the given sentence and apply preprocessing if requested
    (conversion to lower case and digit substitution).
    '''
    if preprocess:
        text = re.sub(r'\d', '9', text.lower())

    # the 'ur' string prefix is invalid in Python 3; a plain raw string works
    tokenizer_regexp = r'''(?ux)
    ([^\W\d_]\.)+|                       # one-letter abbreviations, e.g. E.U.A.
    \d{1,3}(\.\d{3})*(,\d+)|             # numbers in format 999.999.999,99999
    \d{1,3}(,\d{3})*(\.\d+)|             # numbers in format 999,999,999.99999
    \d+:\d+|                             # time and proportions
    \d+([-\\/]\d+)*|                     # dates, e.g. 12/03/2012 12-03-2012
    [DSds][Rr][Aa]?\.|                   # common abbreviations such as dr., sr., sra., dra.
    [Mm]\.?[Ss][Cc]\.?|                  # M.Sc. with or without capitalization and dots
    [Pp][Hh]\.?[Dd]\.?|                  # same for Ph.D.
    [^\W\d_]{1,2}\$|                     # currency
    (?:(?<=\s)|^)[\#@]\w*[A-Za-z_]+\w*|  # hashtags and Twitter user names
    -[^\W\d_]+|                          # clitic pronouns with leading hyphen
    \w+([-']\w+)*|                       # words with hyphens or apostrophes, e.g. não-verbal, McDonald's
    -+|                                  # any sequence of dashes
    \.{3,}|                              # ellipsis or sequences of dots
    \S                                   # any non-space character
    '''
    tokenizer = RegexpTokenizer(tokenizer_regexp)
    return tokenizer.tokenize(text)
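# Example call (illustrative). With preprocess=True digits become '9' and the
# text is lower-cased before tokenization, so this should yield
# ['ph.d.', 'students', 'arrive', '99/99/9999', 'at', '99:99']:
# tokenize_sentence("Ph.D. students arrive 12/03/2012 at 10:30")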
def __init__(self):
    # remove the punctuation
    self.tokenize = RegexpTokenizer(r'\b([A-Za-z]+)\b')
    if ver == 2:
        # use the stemmed version of words
        self.stemmer = SnowballStemmer("english")
    elif ver == 1:
        self.stemmer = LancasterStemmer()
    else:
        self.stemmer = PorterStemmer()
def emailExtractor(sentence, word):
    # https://stackoverflow.com/questions/39777806/how-to-update-nltk-package-so-that-it-does-not-break-email-into-3-different-toke
    # keep e-mail addresses intact instead of splitting them into three tokens
    pattern = r'\S+@[^\s.]+\.[a-zA-Z]+|\w+|[^\w\s]'
    tokeniser = RegexpTokenizer(pattern)
    for w in tokeniser.tokenize(sentence):
        if re.search(r'^(\w|\.|\_|\-)+[@](\w|\_|\-|\.)+[.]\w{2,3}$', w):
            context["email"] = w
            return True
    return False
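# Hedged usage sketch: `context` is a module-level dict in the original code
# (assumed here); the `word` parameter is unused by the function itself.
context = {}
if emailExtractor("Reach me at jane.doe@example.org today", None):
    print(context["email"])  # -> jane.doe@example.org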
def tokenize(self, text):
    tokens = []
    tokenizer = RegexpTokenizer(r'(\$?\d+\.\d+)|(([\w]+-)*[\w]+)')
    # tokens += tokenizer.tokenize(self.title.lower())
    tokens += tokenizer.tokenize(text.lower())
    # drop stopwords and single-character tokens (a list comprehension keeps
    # the result a list under Python 3, unlike filter())
    tokens = [x for x in tokens if x not in STOP_WORDS and len(x) > 1]
    # stemmer = nltk.stem.snowball.EnglishStemmer()
    # tokens = map(lambda x: stemmer.stem(x), tokens)
    return tokens
def __init__(self, args, opts):
    super(Method, self).__init__(args, opts)
    self.es_int = ESInterface(host=self.opts.server,
                              port=self.opts.port,
                              index_name=self.opts.index_name)
    self.analyzer = self.es_int.get_index_analyzer()
    self.regex_citation = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"(\[(\d+([,–]\s?)?)+\])|"
        r"\[[\d,-]+\]").sub
    self.all_digits = re.compile(r"^\d+$").search
    if self.opts.remove_stopwords:
        with open(self.opts.stopwords_path) as f:
            self.stopwords = frozenset([l.strip().lower() for l in f])
    else:
        self.stopwords = frozenset([])
    self.db = MySQLdb.connect(host=constants.mysql_server,
                              port=constants.mysql_port,
                              user=constants.mysql_user,
                              passwd=constants.mysql_pass,
                              db=constants.mysql_db)
    self.cur = self.db.cursor()
    self.ttys = ['SY']
    ttygroups = {"syns": ('AUN', 'EQ', 'SYN', 'MTH'),
                 "chemicals": ('CCN', 'CSN'),
                 "drugs": ('BD', 'BN', 'CD', 'DP', 'FBD', 'GN', 'OCD'),
                 "diseases": ('DI', ),
                 "findings": ('FI', ),
                 "hierarchy": ('HS', 'HT', 'HX'),
                 "related": ('RT', ),
                 "preferred": ('PTN', 'PT')}
    self.doc_mod = documents_model.DocumentsModel(opts.anns_dir)
    # self.ann_client = AnnotationsClient()
    # matches APA citations such as (Chen et al., 2000) or Chen et al. (2000)
    self.reg_apa = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"\(\s?([^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}(,\s)?)+(\sand\s)?[^ ]+\s?[^ ]*et\sal\.,?\s\d{2,4}\)|"
        r"\w+\set al\. \(\d{2,4}\)")
    # rarer APA variants such as "Chen et al. 2000" without parentheses
    self.reg_apa_rare = re.compile(
        r"((([A-Z]\w+\set\sal\.,? \d{4})|([A-Z]\w+\sand\s[A-Z]\w+,?\s\d{4}))((,\s)| and )?)+")
    self.reg_apa2 = re.compile(
        r"\(\s?(\w+\s?\w*et\sal\.,\s\d{2,4}(,\s)?)+(\sand\s)?\w+\s?\w*et\sal\.,\s\d{2,4}\)")
    # IEEE-style numeric citations like [3], [1, 2], [1–4]
    self.reg_ieee = re.compile(r"(\[(\d+([,–]\s?)?)+\])|\[\s?[\d,-]+\]")
    self.reg_paranthesis = re.compile(
        r"\(\s?\d{1,2}(,\s\d{1,2})*(\sand\s\d{1,2})?\)")
    self.nlp_extractor = Extract_NLP_Tags()
    self.tokenizer = RegexpTokenizer(r'[^\w\-\']+', gaps=True)
    self.lmtzr = WordNetLemmatizer()
    self.stemmer = stem.porter.PorterStemmer()
def tokenize_large_text_file(file_name, file_location):
    tokens_array = set()
    file_path = os.path.join(file_location, file_name)
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    # read line by line so the whole file never has to fit in memory
    with open(file_path, 'r') as f:
        for line in f:
            tokens_array.update(tokenizer.tokenize(line))
    tokens_dict = {"{0} - {0} tokens".format(file_name): list(tokens_array)}
    with open('tokens taken from - {} - .json'.format(file_name), 'w') as f:
        json.dump(tokens_dict, f)
def __init__(self):
    nltk.download("punkt")
    nltk.download("stopwords")
    nltk.download("wordnet")  # lemmatization
    self._tokenizer = RegexpTokenizer(r"\w+")
    self._stop_words = set(stopwords.words("english"))
    # self._stemmer = nltk.stem.SnowballStemmer("english")
    self._lemmatizer = nltk.wordnet.WordNetLemmatizer()
    self._vocabulary = set()
def getTokenCount(description):
    tokens = RegexpTokenizer(r'\w+').tokenize(description)
    tokens = [w.lower() for w in tokens]
    # safe_load avoids executing arbitrary YAML tags; the with-block closes the file
    with open("backend/nltk/stopwords.yaml", "r") as f:
        stopwords = yaml.safe_load(f)
    tokens = [w for w in tokens if w not in stopwords]
    tokens = [w for w in tokens if len(w) > 2]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(w) for w in tokens]
    tokenCount = collections.Counter(tokens)
    return tokenCount
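# Illustrative call (assumes the stopwords YAML ships with the project):
# getTokenCount("Running runners run quickly") might yield something like
# Counter({'run': 2, 'runner': 1, 'quickli': 1}) after lower-casing,
# stopword removal, the len > 2 filter, and Porter stemming.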
def prep_string(s):
    s = re.sub("\n", " ", s)
    s = re.sub(">", " ", s)
    # toks = Token(TEXT=s, LOC=CharSpanLocation(0, len(s), 's'))
    wordre = r"\w+@[\w.]+|'s|[0-9]+[.0-9]+|[0-9]+|^\w+:|([A-Z][.]+)+|(\w+[-']?)+|[.!?]|:\w*\n"
    toks = RegexpTokenizer(wordre).tokenize(s)
    return list(toks)
def getDoc_set():
    tokenizer = RegexpTokenizer(r'\w+')
    # the stopword list and stemmer are loop-invariant, so build them once
    en_stop = get_stop_words("en")
    p_stemmer = PorterStemmer()
    for doc in getCorpus.corpus_doc:
        raw = doc.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if i not in en_stop]
        texts = [p_stemmer.stem(i).encode('utf-8') for i in stopped_tokens]
        getCorpus.doc_set.append(texts)
def calcLocation(self, token):
    # 'length' avoids shadowing the built-in len()
    length = self.instTab.searchFormat(token.operator)
    if length > 0:
        return length

    if token.operator == "RESW" or token.operator == "WORD":
        length = 3
    elif token.operator == "RESB":
        length = int(token.operand[0])
    elif token.operator == "BYTE":
        length = 1
    elif token.operator == "LTORG":
        length = self.literalTab.literalCount
        self.literalTab.setLiteralCount(0)
        count = 0
        for litCheck in self.literCheck:
            # 'C' (character) literals occupy 3 bytes, others 1 byte
            if litCheck[1:2] == 'C':
                length = 3
            else:
                length = 1
            self.literalTab.modifyLiteral(
                litCheck, TokenTable.locCount + (count * length))
            count += 1
    elif token.operator == "END":
        length = self.literalTab.literalCount
        self.literalTab.setLiteralCount(0)
        for litCheck in self.literCheck:
            self.literalTab.modifyLiteral(litCheck, token.location)
    elif token.operator == "EQU":
        if token.operand[0] == "*":
            length = 0
        else:
            # evaluate expressions of the form SYMBOL1-SYMBOL2
            tokenizer = RegexpTokenizer("-", gaps=True)
            tokens = tokenizer.tokenize(token.operand[0])
            value1 = self.symTab.search(tokens[0])
            value2 = self.symTab.search(tokens[1])
            length = value1 - value2
            self.symTab.modifySymbol(token.label, length)
            length = 0
    else:
        length = -1
    return length
def get_search_terms(search_text: str):
    # Get any search terms, respecting quoted strings
    tr = RegexpTokenizer(r'[^"\s]\S*|".+?"', gaps=False)
    search_terms = tr.tokenize(search_text)
    if len(search_terms) == 0:
        solr_search_terms = "*"
    else:
        solr_search_terms = ' '.join(search_terms)
    return solr_search_terms
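# Behaviour check (illustrative): quoted phrases survive as single terms,
# and an empty query falls back to the Solr match-all wildcard.
# >>> get_search_terms('foo "bar baz"')
# 'foo "bar baz"'
# >>> get_search_terms('')
# '*'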
def get_search_terms(request: HttpRequest):
    # Get any search terms, respecting quoted strings
    tr = RegexpTokenizer(r'[^"\s]\S*|".+?"', gaps=False)
    search_text = str(request.GET.get('search_text', ''))
    search_terms = tr.tokenize(search_text)
    if len(search_terms) == 0:
        solr_search_terms = "*"
    else:
        solr_search_terms = ' '.join(search_terms)
    return solr_search_terms
def __init__(self, corpus, tokenize_str, delimiter, n, max_length):
    self.corpus = corpus
    self.tokenizer = RegexpTokenizer(tokenize_str)
    self.delimiter = delimiter
    self.n = n
    self.max_length = max_length
    # populated via the setter methods called below
    self.tokenized_corpus = []
    self.startList = []
    self.ngramDict = defaultdict(list)
    self.unigramDict = defaultdict(list)
    self.set_tokenized_corpus()
    self.set_ngrams()
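# Hedged usage sketch: the class name is not visible in this snippet, so
# `NgramModel` is a placeholder, and set_tokenized_corpus()/set_ngrams() are
# assumed to be defined alongside __init__:
#
#     model = NgramModel(corpus=raw_text, tokenize_str=r"\w+|[^\w\s]",
#                        delimiter=' ', n=3, max_length=20)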
def sentence_length(corpus):
    too_long_sentences = []
    total_sentences = 0
    tokenizer = RegexpTokenizer(r"\s+", gaps=True)
    articles = preprocessing(corpus)
    for article in articles:
        sentences = sent_tokenize(article)
        total_sentences += len(sentences)
        for sentence in sentences:
            words = tokenizer.tokenize(sentence)
            # sentences longer than 25 words count against the score
            if len(words) > 25:
                too_long_sentences.append((sentence, len(words)))
    return (1 - len(too_long_sentences) / total_sentences) * 100
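# Hedged usage sketch: `preprocessing` (and NLTK's sent_tokenize) come from
# elsewhere in the original module. The return value is the percentage of
# sentences with at most 25 whitespace-separated words:
# readability = sentence_length(corpus_text)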
def getNew_object(docs, newDoc_object):
    tokenizer = RegexpTokenizer(r'\w+')
    # the stopword list and stemmer are loop-invariant, so build them once
    en_stop = get_stop_words("en")
    p_stemmer = PorterStemmer()
    for doc in docs:
        raw = doc[1].lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if i not in en_stop]
        # count the occurrences of each stemmed token
        texts = {}
        for i in stopped_tokens:
            stemmed = p_stemmer.stem(i).encode('utf-8')
            texts[stemmed] = texts.get(stemmed, 0) + 1
        newDoc_object.append(texts)
def _getWordLists(self):
    # tokenize sentences (list comprehensions keep the results as lists
    # under Python 3, where map() and filter() return lazy iterators)
    # wordLists = [WordPunctTokenizer().tokenize(s) for s in self.sentenceList]
    # wordLists = [PunktWordTokenizer().tokenize(s) for s in self.sentenceList]
    wordLists = [RegexpTokenizer(r"\w+").tokenize(s) for s in self.sentenceList]
    # remove stopwords
    stopWords = stopwords.words('english')
    wordLists = [[w for w in wlist if w not in stopWords] for wlist in wordLists]
    # use stemmer
    # stemmer = PorterStemmer()
    # wordLists = [[stemmer.stem(w) for w in wlist] for wlist in wordLists]
    return wordLists
def __init__(self, root, fields=DOC_PATTERN, sent_pattern=SENT_PATTERN,
             encoding='utf8', **kargs):
    """
    :param root: directory containing the corpus
    :param fields: corpus files to read
    :param encoding: file encoding
    """
    PlaintextCorpusReader.__init__(
        self, root, fields,
        word_tokenizer=JanomeTokenizer(),
        sent_tokenizer=RegexpTokenizer(sent_pattern),
        encoding=encoding)
def get_special_text_tokeniser(self):
    """
    @deprecated
    Customised NLTK regex tokeniser for special cases,
    e.g., 3rd, 2nd, 1-23-4562, 425-12-3456, wal-mart
    TODO: try to use the Solr StandardTokenizer
    """
    '''
    special_text_token_pattern = r"""
    (?x)                              # set flag to allow verbose regexps
    ([A-Z]\.)+                        # abbreviations, e.g. U.S.A.
    |(\$)?\d+(\.\d+)?%?[a-zA-Z0-9]*   # currency, percentages, and number/letter mixes: $12.40, 50%, 3rd, 2nd
    |\w+(-\w+)*                       # words with internal hyphens
    #|[a-zA-Z0-9]+
    |'s                               # possessive
    |\.\.\.                           # ellipsis
    |[][.,;"'?():*\-_/\\@&']          # separate special character tokens (punctuation)
    """
    '''
    pattern = r'''(?x) (?:[A-Z]\.)+|\d+(?:\.\d+)?%?|\w+(?:[-']\w+)*|(?:[.,;"'?():*\-_/\\@&'])'''
    return RegexpTokenizer(pattern)
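# Illustration of the active pattern (note it differs from the commented-out
# draft: "3rd" splits into "3" + "rd" because the live digit branch carries
# no trailing letters):
# >>> self.get_special_text_tokeniser().tokenize("U.S.A. wal-mart 50% 3rd")
# ['U.S.A.', 'wal-mart', '50%', '3', 'rd']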
def parsing(self, line):
    line = line[:-1]  # strip the trailing '\n'
    # split the incoming line on tabs and collect the pieces in a tokens list
    tokenizer = RegexpTokenizer("\t", gaps=True)
    tokens = tokenizer.tokenize(line)
    count = 1
    for token in tokens:
        if count == 1:
            self.instruction = token
        elif count == 2:
            self.format = int(token)
        elif count == 3:
            self.opcode = int(token, 16)  # opcode is given in hex
        elif count == 4:
            self.numberOfOperand = int(token)
        else:
            print("[InstTable.py] parsing() error")
        count += 1
def __init__(self, args, opts):
    super(Method, self).__init__(args, opts)
    self.es_int = ESInterface(host=self.opts.server,
                              port=self.opts.port,
                              index_name=self.opts.index_name)
    self.regex_citation = re.compile(
        r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}(;\s)?)+\s?\)|"
        r"(\[(\d+([,–]\s?)?)+\])|"
        r"\[[\d,-]+\]").sub
    self.all_digits = re.compile(r"^\d+$").search
    # fall back to the default stopword list when no path is given
    if self.opts.stopwords_path:
        stop_path = self.opts.stopwords_path
    else:
        stop_path = STOPWORDS_PATH
    if self.opts.remove_stopwords:
        with open(stop_path) as f:
            self.stopwords = frozenset([l.strip().lower() for l in f])
    else:
        self.stopwords = frozenset([])
    self.tokenizer = RegexpTokenizer(r'[^\w\-\']+', gaps=True)
def tokenizeArticle(article):
    # wordTokens = word_tokenize(article)
    tokenizer = RegexpTokenizer(r'\w+')
    wordTokens = tokenizer.tokenize(article)
    # strings with no letters
    pattern = re.compile(r"^[\W\s_0-9]+$")
    filteredTokens = [token for token in wordTokens if not pattern.match(token)]
    # remove punctuation
    filteredTokens = [token for token in filteredTokens
                      if token not in string.punctuation]
    # remove empty strings
    filteredTokens = [token for token in filteredTokens
                      if token != "''" and token != '``']
    # remove numbers
    filteredTokens = [token for token in filteredTokens if not is_number(token)]
    # remove the 30-word stopword list
    filteredTokens = [token for token in filteredTokens if token not in stopwords]
    # remove the 150-word stopword list
    # filteredTokens = [token for token in filteredTokens if token not in stopwords[30:151]]
    # case folding
    filteredTokens = [token.lower() for token in filteredTokens]
    # stemming
    # Stemming = PorterStemmer()
    # filteredTokens = [Stemming.stem(token) for token in filteredTokens]
    # remove digits
    filteredTokens = [token for token in filteredTokens if not token.isdigit()]
    return filteredTokens
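# Hedged usage sketch: `stopwords` and `is_number` are module-level names in
# the original file (assumed here):
# tokens = tokenizeArticle("The 3 quick brown foxes jumped over 2 lazy dogs.")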