def index(client, freq_file, lang):
    tweets = client['twitter_' + lang]['tweets']
    # freq_dict = load(freq_file)
    freq_dict = defaultdict(int)
    i = 0
    for tweet in tweets.find():
        i += 1
        if i % 100000 == 0:
            print i
        # tweets.update({'_id': tweet['_id']}, {'$set': {'indexed': True}})
        # text = tweet['text'].lower()
        text = tweet['text']
        text = re.sub(filter_pattern, '', text)
        for sent in split_multi(text):
            for word in word_tokenizer(sent):
                freq_dict[word] += 1
    # for the second db (tr)
    tweets = client['new_' + lang]['tweets']
    for tweet in tweets.find():
        i += 1
        if i % 100000 == 0:
            print i
        # tweets.update({'_id': tweet['_id']}, {'$set': {'indexed': True}})
        # text = tweet['text'].lower()
        text = tweet['text']
        text = re.sub(filter_pattern, '', text)
        for sent in split_multi(text):
            for word in word_tokenizer(sent):
                freq_dict[word] += 1
    save(freq_file, freq_dict)
def char_split_if_io_example(sentence):
    '''ADD SECTION FOR CONVERTING EVERYTHING TO LOWER CASE'''
    """split text into characters"""
    """used for input/output examples for which char level info is relevant"""
    i = 'Input ¶'
    o = 'Output ¶'
    '''
    i = 'Input \xb6'
    o = 'Output \xb6'
    '''
    sentence_encoded = sentence
    sentence = sentence.decode('utf-8')
    if i in sentence_encoded:
        sentence = sentence_encoded.split(i)
        sentence = word_tokenizer(i.decode('utf-8')) + list(sentence[1])
        s = sentence
        for jdx, j in enumerate(sentence):
            if j == '\xc2':
                s[jdx:jdx + 2] = [u'\xb6']
        sentence = s
    elif o in sentence_encoded:
        sentence = sentence_encoded.split(o)
        sentence = word_tokenizer(o.decode('utf-8')) + list(sentence[1])
        s = sentence
        for jdx, j in enumerate(sentence):
            if j == '\xc2':
                s[jdx:jdx + 2] = [u'\xb6']
        sentence = s
    else:
        sentence = word_tokenizer(sentence)
    return sentence
def validate_keywords_contexts_offsets(source, index_type='es', extract_contexts=False):
    """ Validate an updated document """
    assert 'keyterms' in source and 'offsets' in source
    # Keyterms are stored as a json string
    keyterms = source['keyterms']
    offsets = source['offsets']
    if keyterms is None:
        # When keyterms is None it is because the extracted keyterms could not meet the filter criteria
        # (i.e. min_bg_count). This should only happen when min_bg_count > 1
        return
    if index_type == 'es':
        field_text = source[FIELD_NAME]
    else:
        # Mongo doesn't have the raw text field; get from ES
        doc_id = source['_id']
        es_source = get_es_source(doc_id)
        field_text = es_source[FIELD_NAME]
    # Because of ambiguity with punctuation when extracting keyterms, we
    # include tokens split by punct and removed punct
    field_tokens_set = set(tokenizer.word_tokenizer(field_text))
    field_tokens_set = expand_tokens_set_to_split_by_punct(field_tokens_set)
    # To account for how ES handles punctuation, we use both tokenized and non-tokenized forms of each token
    for k in keyterms:
        try:
            assert k in field_tokens_set
        except AssertionError:
            # Try the keyterm without punct
            assert k.translate(str.maketrans('', '', string.punctuation)) in field_tokens_set
    # Check that the keyterm offsets correspond with the raw text field
    for keyterm, offset in zip(keyterms, offsets):
        for start_idx, end_idx in offset:
            assert field_text[start_idx:end_idx] == keyterm
    if extract_contexts:
        assert 'contexts' in source
        contexts = source['contexts']
        # Check that the contexts cover all the identified keyterms
        context_tokens_set = set()
        for ctx in contexts:
            context_tokens_set = context_tokens_set.union(tokenizer.word_tokenizer(ctx))
        context_tokens_set = expand_tokens_set_to_split_by_punct(context_tokens_set)
        for keyterm in keyterms:
            try:
                assert keyterm in context_tokens_set
            except AssertionError:
                # Try the keyterm without punct
                assert keyterm.translate(str.maketrans('', '', string.punctuation)) in context_tokens_set
def __next__(self):
    if self._curr_row is None:
        raise StopIteration()
    row = self._curr_row
    if len(row) != self._row_len:
        msg = 'found %d columns, but expected %d at line %s:\n%s'
        raise IOError(msg % (len(row), self._row_len, self._line, str(row)))
    try:
        self._curr_row = next(self._row_gen)
        self._line += 1
    except StopIteration:
        self._curr_row = None
    data_or_text = lambda c: row[c] if c not in self.text_columns else []
    data = [data_or_text(col) for col in range(self._row_len)]
    for col in self.text_columns:
        for sentence in split_single(row[col]):
            sentence = self._decap(sentence)
            tokens = [self._lower(t) for t in word_tokenizer(sentence)]
            data[col].append(tokens)
    return data
def __init__(self, text: str = None, use_tokenizer: bool = False, labels: List[str] = None):
    self.tokens: List[Token] = []
    self.labels: List[str] = labels
    self._embeddings: Dict = {}
    # optionally, directly instantiate with sentence tokens
    if text is not None:
        # tokenize the text first if option selected, otherwise assumes whitespace tokenized text
        if use_tokenizer:
            sentences = split_single(text)
            tokens = []
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            text = ' '.join(tokens)
        # add each word in tokenized string as Token object to Sentence
        for word in text.split(' '):
            self.add_token(Token(word))
def tokenize(text, segment=True, norm=True, unique=False, min_len=2, max_sent=0):
    ''' Tokenize text using SegTok segmenter and tokenizer. '''
    sentences = split_multi(text) if segment else [text]
    tokens = []
    for i, s in enumerate(sentences):
        if max_sent and i >= max_sent:
            break
        tokens += word_tokenizer(s)
    if unique:
        tokens = list(set(tokens))
    if min_len:
        tokens = [t for t in tokens if len(t) >= min_len]
    if norm:
        tokens = [w for t in tokens for w in normalize(t).split()]
    return tokens
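# Hedged usage sketch for the helper above (run together with that function). `normalize` is
# whatever normaliser the surrounding module provides, so the call disables it via norm=False;
# the printed result is indicative only.
from segtok.segmenter import split_multi
from segtok.tokenizer import word_tokenizer

print(tokenize("The quick brown fox jumped. It ran away!", norm=False, min_len=3, max_sent=1))
# e.g. ['The', 'quick', 'brown', 'fox', 'jumped'] -- first sentence only, tokens shorter than 3 chars dropped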
def tokenize(self, review, should_stem=True):
    cleaned_review = clean_sentence(review.lower())
    if should_stem:
        cleaned_review = " ".join([self.stemmer.stem(word) for word in cleaned_review.split()])
    tokenized_review = tokenizer.word_tokenizer(cleaned_review)
    return tokenized_review
def run_tokenize(text: str) -> List[str]:
    words: List[str] = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        words.extend(contractions)
    words = list(filter(None, words))
    return words
def tokenize(text):
    """
    Inputs: text
    Outputs: tokens tokenized by segtok.tokenizer
    """
    tokens = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        tokens.extend(contractions)
    return tokens
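# Quick illustration of the pipeline used above (run together with the function defined above).
# segtok's split_contractions separates English contractions, so the output should be roughly:
from segtok.segmenter import split_single
from segtok.tokenizer import split_contractions, word_tokenizer

print(tokenize("This isn't hard. Really!"))
# -> ['This', 'is', "n't", 'hard', '.', 'Really', '!']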
def tokenize(self, text):
    """ tokenize the text and filter the @username (and punctuation, smiley ...), leave only words """
    words = []  # list of words
    # text = text.decode('utf-8')
    text = filter_pattern.sub(' ', text)
    for sent in split_multi(text):
        for token in word_tokenizer(sent):
            words.append(token.encode('utf-8', 'ignore'))
    return words
def __init__(self, text: str = None, use_tokenizer: bool = False,
             labels: Union[List[Label], List[str]] = None):
    super(Sentence, self).__init__()
    self.tokens: List[Token] = []
    self.labels: List[Label] = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings: Dict = {}
    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:
        # tokenize the text first if option selected
        if use_tokenizer:
            # use segtok for tokenization
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            # determine offsets for whitespace_after field
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                token = Token(word)
                self.add_token(token)
                try:
                    word_offset = index(word, running_offset)
                except:
                    word_offset = last_word_offset + 1
                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token
        # otherwise assumes whitespace tokenized text
        else:
            # add each word in tokenized string as Token object to Sentence
            for word in text.split(' '):
                if word:
                    token = Token(word)
                    self.add_token(token)
def tokenize_old(output_file, db='crawler'):
    client = MongoClient()
    texts = client[db]['texts']
    f = open(output_file, 'w')
    # (TODO: some query to get specific data)
    for entry in texts.find():
        text = entry['text'].decode('utf-8', 'ignore')
        # (optional: write article level data)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                f.write('%s\t%s\n' % (token.encode('utf-8', 'ignore'), 'X'))
            f.write('\n')
    f.close()
def __init__(self, text=None, use_tokenizer=False, labels=None):
    super(Sentence, self).__init__()
    self.tokens = []
    self.labels = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings = {}
    if text is not None:
        if use_tokenizer:
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = (running_offset + 1) if (running_offset > 0) else running_offset
                token = Token(word, start_position=start_position)
                self.add_token(token)
                if (word_offset - 1) == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token
        else:
            word = u''
            for index, char in enumerate(text):
                if char == u' ':
                    if len(word) > 0:
                        token = Token(word, start_position=(index - len(word)))
                        self.add_token(token)
                    word = u''
                else:
                    word += char
            index += 1
            if len(word) > 0:
                token = Token(word, start_position=(index - len(word)))
                self.add_token(token)
def parse_words(self):
    stemmer = SnowballStemmer("english")
    with jsonlines.open(self.src) as reader:
        for obj in tqdm(reader.iter(type=dict, skip_invalid=True)):
            # review = tokenizer.word_tokenizer(obj["text"].lower())
            # curr_words += review
            review = obj["text"]
            cleaned_review = clean_sentence(review)
            # stemmed_review = " ".join([stemmer.stem(word) for word in cleaned_review.split()])  # UNCOMMENT THIS LINE FOR STEMMING
            tokenized_review = tokenizer.word_tokenizer(cleaned_review.lower())
            self.words.update(tokenized_review)
    print(len(self.words), "unique total words")
def read(origin_file, freq_file, lang):
    freq_dict = defaultdict(int)
    i = 0
    for line in open(origin_file):
        i += 1
        if i % 100000 == 0:
            print i
        items = line.strip().split(',', 3)
        if len(items) == 4 and items[0] == lang:
            # text = items[3].lower().decode('utf-8')
            text = items[3].decode('utf-8')
            text = re.sub(filter_pattern, '', text)
            for sent in split_multi(text):
                for word in word_tokenizer(sent):
                    freq_dict[word] += 1
    save(freq_file, freq_dict)
def segtok_tokenizer(text: str) -> List[Token]:
    """
    Tokenizer using segtok, a third party library dedicated to rules-based Indo-European languages.
    https://github.com/fnl/segtok
    """
    tokens: List[Token] = []
    words: List[str] = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        words.extend(contractions)
    words = list(filter(None, words))
    # determine offsets for whitespace_after field
    index = text.index
    current_offset = 0
    previous_word_offset = -1
    previous_token = None
    for word in words:
        # try:
        word_offset = index(word, current_offset)
        start_position = word_offset
        # except:
        #     word_offset = previous_word_offset + 1
        #     start_position = (
        #         current_offset + 1 if current_offset > 0 else current_offset
        #     )
        if word:
            token = Token(text=word, start_position=start_position, whitespace_after=True)
            tokens.append(token)
        if (previous_token is not None) and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False
        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token
    return tokens
def tokenize_on_date(output_file, date='2015-07-05'):
    client = MongoClient()
    texts = client['crawler']['texts']
    f = open(output_file, 'w')
    # (TODO: some query to get specific data)
    for entry in texts.find({'date': date}):
        text = entry['text'].decode('utf-8', 'ignore')
        # (optional: write article level data)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                if re.search("'s$", token):
                    f.write('%s\t%s\n' % (token[:-2].encode('utf-8', 'ignore'), 'X'))
                    f.write('%s\t%s\n' % (token[-2:].encode('utf-8', 'ignore'), 'X'))
                else:
                    f.write('%s\t%s\n' % (token.encode('utf-8', 'ignore'), 'X'))
            f.write('\n')
    f.close()
def _n_grams(self):
    """Make n_grams dict from list of messages."""
    main_dict = defaultdict(dict)
    for record in self.messages_list:
        tokenized = word_tokenizer(record)
        tokenized.insert(0, "<start_token>")
        tokenized.append("<end_token>")
        # TODO: find a better way to insert start/end tokens
        n_gramed = ngrams(tokenized, self.n_gram)
        for n_gram in n_gramed:
            # instead of counter, so we iterate over only once
            # but with if statement
            # TODO: estimate speed of two approaches
            if main_dict.get(n_gram[:-1], {}).get(n_gram[-1]):
                main_dict[n_gram[:-1]][n_gram[-1]] += 1
            else:
                main_dict[n_gram[:-1]][n_gram[-1]] = 1
    self.lm_dict = main_dict
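# Standalone sketch of the same counting scheme for a bigram model (n_gram = 2), assuming the
# `ngrams` helper used above comes from nltk.util; the toy messages are illustrative only.
from collections import defaultdict
from nltk.util import ngrams
from segtok.tokenizer import word_tokenizer

lm = defaultdict(dict)
for msg in ["hello world", "hello there"]:
    toks = word_tokenizer(msg)
    toks.insert(0, "<start_token>")
    toks.append("<end_token>")
    for gram in ngrams(toks, 2):
        lm[gram[:-1]][gram[-1]] = lm[gram[:-1]].get(gram[-1], 0) + 1
# lm[("hello",)] -> {"world": 1, "there": 1}; lm[("<start_token>",)] -> {"hello": 2}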
def preprocess_capitalization(text):
    words = tokenizer.word_tokenizer(text)
    final_words = []
    for word in words:
        if not word.isalpha():
            final_words.append(word.lower())
        else:
            if word.islower():
                pass
            elif word.isupper():
                final_words.append("⇧")
            elif word[0].isupper() and word[1:].islower():
                final_words.append("↑")
            else:
                final_words.append("↑")
            final_words.append(word.lower())
    return " ".join(final_words)
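# Illustrative call (run together with the function above; `tokenizer` is segtok.tokenizer).
# The markers come from that function ("⇧" = all caps, "↑" = leading capital) and the expected
# string below is hand-derived, so treat it as a sketch rather than a guaranteed output.
from segtok import tokenizer

print(preprocess_capitalization("The QUICK Brown fox"))
# -> "↑ the ⇧ quick ↑ brown fox"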
def sentences_iterator(self, log_every=10000):
    # Do a full pass over the data set
    c = 0
    for batch in self.es_utility.scroll_indexed_data(
            self.es_field_name, self.must_have_fields,
            self.must_not_have_fields, self.use_analyzed_field):
        for d in batch:
            if self.use_analyzed_field:
                tokens = self.extract_tokens_from_termvectors(d, self.es_field_name)
            else:
                source = d['_source']
                text = source[self.es_field_name]
                tokens = tokenizer.word_tokenizer(text)
            yield tokens
            c += 1
            if c % log_every == 0:
                print("Processed {} documents".format(c))
def tokenize(self, tweets):
    """ tokenize the text and filter the @username (and punctuation, smiley ...), leave only words """
    counts = []  # [5, 12, 0, 3, ...] the counts of valid words for each tweet
    words = []  # list of words
    # out = ''  # one-word-per-line string of the tokenized words for morph analysis
    for (text, tid, uid) in tweets:
        i = 0
        text = filter_pattern.sub(' ', text)
        for sent in split_multi(text):
            for token in word_tokenizer(sent):
                # words.append(token.lower().encode('utf-8', 'ignore'))
                words.append(token.encode('utf-8', 'ignore'))
                i += 1
        counts.append(i)
    return words, counts
def _html_tokenize(sentence):
    """Tokenize string into words, not splitting URIs or emails, wrapping segtok:word_tokenizer.

    It does not split URIs or e-mail addresses. It does not treat html escapes as single
    characters outside of these instances (e.g. &amp; -> '&', 'amp', ';').

    Args:
        sentence: input string for tokenization
    Returns:
        tokens: list of str
    """
    tokens = []
    for i, span in enumerate(web_tokenizer.split(sentence)):
        if i % 2:
            tokens.append(span)
        else:
            tokens.extend(word_tokenizer(span))
    return tokens
def run_tokenize(text: str) -> List[Token]:
    tokens: List[Token] = []
    words: List[str] = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        words.extend(contractions)
    words = list(filter(None, words))
    # determine offsets for whitespace_after field
    index = text.index
    current_offset = 0
    previous_word_offset = -1
    previous_token = None
    for word in words:
        try:
            word_offset = index(word, current_offset)
            start_position = word_offset
        except:
            word_offset = previous_word_offset + 1
            start_position = (current_offset + 1 if current_offset > 0 else current_offset)
        if word:
            token = Token(text=word, start_position=start_position, whitespace_after=True)
            tokens.append(token)
        if (previous_token is not None) and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False
        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token
    return tokens
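# Hedged sketch (run together with the function above): it assumes the Token class used there
# exposes .text, .start_position and .whitespace_after, which is how the offsets are consumed.
from segtok.segmenter import split_single
from segtok.tokenizer import split_contractions, word_tokenizer

toks = run_tokenize("I love Berlin.")
# The trailing "." directly follows "Berlin" in the raw string, so "Berlin" should end up with
# whitespace_after=False; expected along the lines of:
# [('I', 0, True), ('love', 2, True), ('Berlin', 7, False), ('.', 13, True)]
print([(t.text, t.start_position, t.whitespace_after) for t in toks])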
def textrank(text, hdr):
    # finding out the most possible language of the text
    lang_code = lang_identifier.classify(' '.join([hdr, text]))[0]
    # tokenizing for words
    sentences = [sentence for sentence in split_multi(text)]
    stemmer = snowballstemmer.stemmer(LANG_CODES.get(lang_code, 'english'))
    words = [set(stemmer.stemWord(word) for word in word_tokenizer(sentence.lower()) if word.isalpha())
             for sentence in sentences]
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)
    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)
    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True), lang_code
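# Hedged usage sketch for textrank() above; the module-level helpers it relies on
# (lang_identifier, LANG_CODES, similarity, nx, snowballstemmer) are assumed to be set up as in
# that snippet, and the sample text is illustrative only.
article = ("Graph-based ranking works well for extractive summarisation. "
           "Sentences that share many stems reinforce each other. "
           "The highest-ranked sentences form the summary.")
ranked, lang = textrank(article, "Extractive summarisation")
for idx, score, sentence in ranked[:2]:
    print(idx, round(score, 3), sentence)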
def __init__(self, text: str = None, use_tokenizer: str = 'split',
             labels: Union[List[Label], List[str]] = None):
    super(Sentence, self).__init__()
    self.tokens: List[Token] = []
    self.labels: List[Label] = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings: Dict = {}
    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:
        # tokenize the text first if option selected
        if use_tokenizer == 'segtok':
            # use segtok for tokenization
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            # determine offsets for whitespace_after field
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = running_offset + 1 if running_offset > 0 else running_offset
                token = Token(word, start_position=start_position)
                self.add_token(token)
                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token
        # otherwise assumes whitespace tokenized text
        elif use_tokenizer == 'split':
            # add each word in tokenized string as Token object to Sentence
            offset = 0
            for word in text.split(' '):
                if word:
                    try:
                        word_offset = text.index(word, offset)
                    except:
                        word_offset = offset
                    token = Token(word, start_position=word_offset)
                    self.add_token(token)
                    offset += len(word) + 1
        elif use_tokenizer == 'toki':
            cmd = ['toki-app', '-q', '-n', '-c', 'nkjp']
            p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE)
            stdout = p.communicate(input=text.encode('utf-8'))[0]
            offset = 0
            print(stdout.decode('utf-8').split('\n'))
            for t in stdout.decode('utf-8').split('\n')[:-2]:  # omit last two newlines
                print('XX', t)
                m = re.match(r'^(.*)/[tp]:(none|space|newline)', t)
                word = m.group(1)
                # before = m.group(2)
                # print(word, text)
                word_offset = text.index(word, offset)
                token = Token(word, start_position=word_offset)
                self.add_token(token)
                offset = word_offset + len(word)
def getScore():
    if request.method == 'POST':
        data_received = request.form['mydata'].lower()
        is_url = validators.url(data_received)
        headline = ""
        if not is_url:
            # if we didn't get a url we got a headline as our data
            headline = data_received
        else:
            # we need to go fetch the headline from the url
            source = requests.get(data_received)
            soup = bs.BeautifulSoup(source.content, features='html.parser')
            headline = " "
            found_headline = False
            h1_tags = soup.find_all('h1')
            for h1_tag in h1_tags:
                potential_text = h1_tag.find(text=True, recursive=True)
                if len(potential_text) > 1:
                    headline = potential_text
                    found_headline = True
            if not found_headline:
                resp = make_response()
                resp.status = 400
                return resp
        with tf.Session(graph=default_graph) as sess:
            model.saver.restore(sess, model_file)
            tokenized = tokenizer.word_tokenizer(headline)
            numerized = numerize_sequence(tokenized)
            padded, mask = pad_sequence(numerized, padI, input_length)
            hl_element = {}
            hl_element['tokenized'] = tokenized
            hl_element['numerized'] = padded
            hl_element['mask'] = mask
            d_hl = [hl_element]
            hl_input, hl_target, hl_target_mask = build_batch(d_hl, 1)
            feed = {
                model.input_num: hl_input,
                model.targets: hl_target,
                model.targets_mask: hl_target_mask
            }
            loss = sess.run([model.loss], feed_dict=feed)[0]
            analysis = ''
            if loss < 7.00:
                analysis = "The headline is not unusual (not impactful). The article may not have any effect on cryptocurrency prices."
            elif loss < 15.00:
                analysis = "The headline is unusual (potentially impactful). The article may have an impact on cryptocurrency prices. We recommend reading the article!"
            else:
                analysis = "The headline is highly unusual (either potentially very impactful or not related to cryptocurrency). If the article is related to cryptocurrency, we recommend reading the article in detail!"
            resp = make_response('{"loss": ' + str(loss) + ', "headline": "' + headline + '"' +
                                 ', "analysis": "' + analysis + '"' + '}')
            resp.headers['Content-Type'] = "application/json"
            return resp
def __init__(self, text: str = None, use_tokenizer: bool = False,
             labels: Union[List[Label], List[str]] = None):
    super(Sentence, self).__init__()
    self.tokens: List[Token] = []
    self.labels: List[Label] = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings: Dict = {}
    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:
        # tokenize the text first if option selected
        if use_tokenizer:
            # use segtok for tokenization
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            # determine offsets for whitespace_after field
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = running_offset + 1 if running_offset > 0 else running_offset
                token = Token(word, start_position=start_position)
                self.add_token(token)
                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token
        # otherwise assumes whitespace tokenized text
        else:
            # catch the empty string case
            if not text:
                raise ValueError("Cannot convert empty string to a Sentence object.")
            # add each word in tokenized string as Token object to Sentence
            word = ''
            for index, char in enumerate(text):
                if char == ' ':
                    if len(word) > 0:
                        token = Token(word, start_position=index - len(word))
                        self.add_token(token)
                    word = ''
                else:
                    word += char
            # increment for last token in sentence if not followed by whitespace
            index += 1
            if len(word) > 0:
                token = Token(word, start_position=index - len(word))
                self.add_token(token)
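# Hedged round-trip sketch for the constructor above (flair-style Sentence/Token API assumed,
# i.e. tokens exposing .text and .whitespace_after): token texts plus the whitespace_after flags
# should reconstruct the original string.
s = Sentence("The grass is green .", use_tokenizer=True)
rebuilt = ""
for token in s.tokens:
    rebuilt += token.text + (" " if token.whitespace_after else "")
print(rebuilt.strip())  # -> "The grass is green ."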
def get_documents(self, path, doc_length):
    ''' Create document list from input documents. '''
    print('Processing documents ...')
    docs = []
    for filename in [f for f in os.listdir(path) if f[-4:] in ['.txt', '.xml']]:
        with open(path + '/' + filename) as f:
            print('Processing file: ' + filename)
            # Remove xml tags and decode
            if filename.endswith('.xml'):
                xml = etree.fromstring(f.read())
                text = etree.tostring(xml, encoding='utf-8', method='text')
                doc = text.decode('utf-8')
            else:
                doc = self.decode(f.read())
            # Process user provided regular expressions
            for regex in self.regex_list:
                doc = re.sub(regex[0], regex[1], doc, flags=re.I)
            # Remove unwanted characters and whitespace
            unwanted_chars = ['&', '/', '|', '_', ':', '=', '(', ')', '[', ']']
            for char in unwanted_chars:
                doc = doc.replace(char, '')
            doc = ' '.join(doc.split())
            # Sentence chunk with Segtok
            sentences = [s for s in segmenter.split_single(doc)]
            # Split large documents into smaller parts
            if doc_length > 0:
                sub_docs = [sentences[i:i + doc_length] for i in xrange(0, len(sentences), doc_length)]
            else:
                sub_docs = [sentences]
            # Tokenize with Segtok or Frog
            for sub_doc in sub_docs:
                tokens = []
                if self.pos_tag:
                    tokens += self.frogger(sub_doc, filename)
                else:
                    for sentence in sub_doc:
                        tokens += [t.lower() for t in tokenizer.word_tokenizer(sentence)]
                if len(tokens):
                    docs.append(tokens)
    for filename in [f for f in os.listdir(path) if f.endswith('.json')]:
        with open(path + '/' + filename) as f:
            print('Processing file: ' + filename)
            docs += json.load(f)['docs']
    print('Number of (sub)documents: ' + str(len(docs)))
    assert docs, 'No documents found'
    return docs
def segment(self, sentence):
    return word_tokenizer(sentence)
def segtok_tokenize(text):
    from segtok.tokenizer import word_tokenizer

    chunks = word_tokenizer(text)
    return find_substrings(chunks, text)
def __init__(
    self,
    text: str = None,
    use_tokenizer: bool = False,
    labels: Union[List[Label], List[str]] = None,
    language_code: str = None,
):
    super(Sentence, self).__init__()
    self.tokens: List[Token] = []
    self.labels: List[Label] = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings: Dict = {}
    self.language_code: str = language_code
    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:
        # tokenize the text first if option selected
        if use_tokenizer:
            # use segtok for tokenization
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            # determine offsets for whitespace_after field
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = (running_offset + 1 if running_offset > 0 else running_offset)
                token = Token(word, start_position=start_position)
                self.add_token(token)
                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token
        # otherwise assumes whitespace tokenized text
        else:
            # add each word in tokenized string as Token object to Sentence
            word = ""
            index = -1
            for index, char in enumerate(text):
                if char == " ":
                    if len(word) > 0:
                        token = Token(word, start_position=index - len(word))
                        self.add_token(token)
                    word = ""
                else:
                    word += char
            # increment for last token in sentence if not followed by whitespace
            index += 1
            if len(word) > 0:
                token = Token(word, start_position=index - len(word))
                self.add_token(token)
    # log a warning if the dataset is empty
    if text == "":
        log.warn(
            "ACHTUNG: An empty Sentence was created! Are there empty strings in your dataset?"
        )
def __init__(self, text: str = None, use_tokenizer: bool = False,
             labels: Union[List[Label], List[str]] = None, language_code: str = None):
    super(Sentence, self).__init__()
    self.tokens = []
    self.labels = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings = {}
    self.language_code = language_code
    if text is not None:
        if use_tokenizer:
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = (running_offset + 1) if (running_offset > 0) else running_offset
                token = Token(word, start_position=start_position)
                self.add_token(token)
                if (word_offset - 1) == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token
        else:
            word = ''
            index = -1
            for index, char in enumerate(text):
                if char == ' ':
                    if len(word) > 0:
                        token = Token(word, start_position=(index - len(word)))
                        self.add_token(token)
                    word = ''
                else:
                    word += char
            index += 1
            if len(word) > 0:
                token = Token(word, start_position=(index - len(word)))
                self.add_token(token)
    if text == '':
        log.warn(
            'ACHTUNG: An empty Sentence was created! Are there empty strings in your dataset?'
        )
    self.tokenized = None
def _get_aligned_tokens_core(original_text, original_aligned, noisy_aligned, edit_ops):
    # TODO: empty noisy text!!
    status = True
    from segtok.tokenizer import word_tokenizer  # , split_contractions
    tokenized_original_text = _split_contractions(word_tokenizer(original_text))
    aligned_tokens = list()
    idx = 0
    for clean_token_text in tokenized_original_text:
        noisy_token_text = ""
        token_idx = 0
        # loop till the first char of the token match with the character of
        # the aligned original text. It will skip spurious tokens that could
        # arise from insertion errors between tokens.
        while token_idx < len(clean_token_text) and idx < len(edit_ops):
            char_token = clean_token_text[token_idx]
            char_orig = original_aligned[idx]
            if char_token == char_orig:
                break
            idx += 1
        while token_idx < len(clean_token_text) and idx < len(edit_ops):
            op = edit_ops[idx]
            char_token = clean_token_text[token_idx]
            char_orig = original_aligned[idx]
            if op == "-":
                noisy_token_text += noisy_aligned[idx]
                token_idx += 1
                check = True
            elif op == "s":
                noisy_token_text += noisy_aligned[idx]
                token_idx += 1
                check = True
            elif op == "i":
                noisy_token_text += noisy_aligned[idx]  # insert char and do not move to the next one
                check = False
            elif op == "d":
                token_idx += 1  # skip char and move to the next one
                check = True
            if check and char_orig != char_token:
                print(f"WRONG!!! idx={idx} {char_orig} != {char_token}")
                print(f"noisy_token_text={noisy_token_text}")
                status = False
            idx += 1
        # the next char is a whitespace (if we are not at the end of a sentence)
        # check whether it is substituted with another character, which will be
        # included into the noisy token text
        if idx < len(edit_ops) and edit_ops[idx] == "s" and original_aligned[idx].isspace():
            noisy_token_text += noisy_aligned[idx]
            idx += 1
        # alternatively, there could be one or more insertions at the end
        # include them into the noisy token text
        while idx < len(edit_ops) and edit_ops[idx] == "i":
            noisy_token_text += noisy_aligned[idx]
            idx += 1
        if idx < len(edit_ops) and edit_ops[idx] in ["-", "d"] and original_aligned[idx].isspace():
            idx += 1
        aligned_tokens.append((clean_token_text, noisy_token_text))
        # if clean_token_text != noisy_token_text:
        #     log.info(f"*{clean_token_text}* -> *{noisy_token_text}*")
    if not status:
        print(f"{original_text}")
        print(f"{original_aligned}")
        print(f"{noisy_aligned}")
        print(f"{edit_ops}")
        print(f"{tokenized_original_text}")
        print(f"{aligned_tokens}")
        exit(-1)
    return aligned_tokens, status
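# Worked toy example for the aligner above, with a hand-constructed alignment rather than output
# of the real pipeline: original "cat" vs. noisy "kat", one substitution at the first character
# ("s" = substitution, "-" = keep, as handled in the loop above).
pairs, ok = _get_aligned_tokens_core("cat", "cat", "kat", ["s", "-", "-"])
# pairs == [("cat", "kat")], ok == True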
def extract_noisy_corpus(input_path, log, max_lines=-1, split_num_lines=int(3e6)):
    from segtok.tokenizer import word_tokenizer
    from pysia.align import _split_contractions
    fname, fext = os.path.splitext(input_path)
    max_lines_str = get_max_lines_alias(max_lines)
    org_dir = f"{fname}_org_{max_lines_str}"
    rec_dir = f"{fname}_rec_{max_lines_str}"
    log.info(f"Starting corpus extraction:")
    log.info(f"Input data directory: {input_path}")
    log.info(f"Original data directory: {org_dir}")
    log.info(f"Noisy data directory: {rec_dir}")
    recreate_directory(org_dir)
    recreate_directory(rec_dir)
    org_file_idx, rec_file_idx = 0, 0
    org_line_idx, rec_line_idx = 0, 0
    # org_line_limit, rec_line_limit = split_num_lines / 10, split_num_lines / 10  # first split for validation
    org_line_limit, rec_line_limit = split_num_lines, split_num_lines
    org_file_path = os.path.join(org_dir, f"{org_file_idx:04d}_org.txt")
    rec_file_path = os.path.join(rec_dir, f"{org_file_idx:04d}_rec.txt")
    log.info(f"opening '{org_file_path}' for writing..")
    org_file = open(org_file_path, "w")
    log.info(f"opening '{rec_file_path}' for writing..")
    rec_file = open(rec_file_path, "w")
    num_org_lines, num_rec_lines = 0, 0
    with open(input_path, "r") as input_file:
        line = input_file.readline()
        line_idx = 0
        while line:
            tokens = _split_contractions(word_tokenizer(line.strip()))
            line = ' '.join([tok.strip() for tok in tokens])
            if line_idx % 3 == 0:
                # header line
                elems = line.split(';')
                if len(elems) != 3:
                    log.error(f"Line: '{line}' length != 3'")
                    exit(-1)
            elif line_idx % 3 == 1:
                # original text
                print(line.strip(), file=org_file)
                org_line_idx += 1
                num_org_lines += 1
                if org_line_idx >= org_line_limit:
                    org_file.close()
                    org_file_idx += 1
                    org_file_path = os.path.join(org_dir, f"{org_file_idx:04d}_org.txt")
                    log.info(f"opening '{org_file_path}' for writing..")
                    org_file = open(org_file_path, "w")
                    org_line_idx = 0
                    org_line_limit = split_num_lines
            elif line_idx % 3 == 2:
                # recognized text
                print(line.strip(), file=rec_file)
                rec_line_idx += 1
                num_rec_lines += 1
                if rec_line_idx >= rec_line_limit:
                    rec_file.close()
                    rec_file_idx += 1
                    rec_file_path = os.path.join(rec_dir, f"{rec_file_idx:04d}_rec.txt")
                    log.info(f"opening '{rec_file_path}' for writing..")
                    rec_file = open(rec_file_path, "w")
                    rec_line_idx = 0
                    rec_line_limit = split_num_lines
            if max_lines > 0:
                num_lines = min(num_org_lines, num_rec_lines)
                if num_lines >= max_lines:
                    break
            line_idx += 1
            line = input_file.readline()
    org_file.close()
    rec_file.close()
    log.info(f"Loaded {line_idx} lines.")