import regex  # third-party 'regex' module (pip install regex)


def get_hashtags(text: str):
    # Find hashtag-like tokens: a hash sign (ASCII '#' or fullwidth '＃') followed
    # by a non-digit word character and any further word characters.
    hashtags = regex.findall(r'(?:[#＃])[^\d\W][\w]*', text)
    # Sort shortest-first so that, for the same start offset, the longest match
    # ends up in the map (later writes overwrite earlier ones).
    hashtags.sort(key=lambda x: len(x))
    idx_hashtag_map = {}
    for hashtag in hashtags:
        indices = [
            m.start(0) for m in regex.finditer(regex.escape(hashtag), text)
        ]
        for idx in indices:
            idx_hashtag_map[idx] = hashtag
    return idx_hashtag_map.items()


def get_mentions(text: str):
    # Same approach as get_hashtags, but for @-mentions (ASCII '@' or fullwidth '＠').
    mentions = regex.findall(r'(?:[@＠])[^\d\W][\w]*', text)
    mentions.sort(key=lambda x: len(x))
    idx_mention_map = {}
    for mention in mentions:
        indices = [
            m.start(0) for m in regex.finditer(regex.escape(mention), text)
        ]
        for idx in indices:
            idx_mention_map[idx] = mention
    return idx_mention_map.items()
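

# Hedged usage sketch (added for illustration; not part of the original snippets).
# Shows what the two helpers above return: a dict-items view mapping each
# occurrence's start offset to the matched tag. The sample text is invented.
def _demo_hashtags_and_mentions():
    sample = "Loving the #sunset with @alice and @bob #sunset"
    print(dict(get_hashtags(sample)))  # {11: '#sunset', 40: '#sunset'}
    print(dict(get_mentions(sample)))  # {35: '@bob', 24: '@alice'} (shortest tag mapped first)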


def find_word(lines, expression):
    # Collect every match of `expression` in the text column of each row, write
    # one spreadsheet row per input line (id, original text, matches), and print
    # the unique matches. `excelutil` is a project-specific writer helper.
    temp = []
    util = excelutil(mode='w')
    for line in lines:
        buff = ""
        for sentence in line[5].split('\n'):
            for word in regex.finditer(expression, sentence):
                # drop the first two characters of the match (a fixed prefix)
                # before de-duplicating
                if word.group(0)[2:] not in temp:
                    temp.append(word.group(0)[2:])
                buff = buff + word.group(0) + "\n"
        util.write_nextline([line[0], line[5], buff], save=False)
    for x in temp:
        print(x)
    util.save()


def replaceAll(self, line, regex, substitution, matchgroup):
    # Repeatedly replace the given match group of `regex` (a compiled pattern)
    # in `line`, skipping matches that fall inside string literals, until no
    # replaceable match remains. Note: the `regex` parameter shadows the module.
    quit = False
    output = line
    while (not quit):
        startmatch = regex.finditer(output)
        quit = True
        for m in startmatch:
            if checkIfInsideString(m.start(matchgroup), self.strings):
                continue
            # todo COMMENT CHECK
            output = output[:m.start(matchgroup)] + substitution + output[m.end(matchgroup):]
            quit = False
            break
    return output
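

# Hedged, self-contained check of replaceAll (added for illustration; not from the
# original source). `checkIfInsideString` and the owning class are not shown above,
# so a stub that never skips a match and a bare namespace stand in for them here.
from types import SimpleNamespace


def checkIfInsideString(pos, strings):
    # stub: pretend no match position falls inside a string literal
    return False


def _demo_replace_all():
    owner = SimpleNamespace(strings=[])  # fake `self` with an empty string-literal list
    pattern = regex.compile(r'\bvar\b')  # replace the whole word 'var'
    print(replaceAll(owner, "var x = var", pattern, "let", 0))  # -> "let x = let"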


from discord.ext import commands  # needed for commands.BadArgument below


async def on_message(self, message):
    # Cog listener: when a non-command guild message contains URLs, ask via a
    # reaction prompt whether to expand them and, if confirmed, show the media
    # behind each URL. SimpleConfirm and UNICODE_EMOJI are presumably helpers
    # defined elsewhere in the bot's codebase.
    ctx = await self.bot.get_context(message)
    if (ctx.valid or not ctx.guild or message.webhook_id
            or ctx.author == self.bot.user):
        return
    results = regex.finditer(self.URL_REGEX, message.content)
    urls = [result.group(0) for result in results]
    if len(urls) > 0:
        confirm = await SimpleConfirm(
            message, emoji=UNICODE_EMOJI["INSPECT"]).prompt(ctx)
        if confirm:
            async with ctx.typing():
                for url in urls:
                    try:
                        await self.show_media(ctx, url)
                    except commands.BadArgument as error:
                        await ctx.send(error)


def find_token(self, text, date_start, date_end):
    # Find the capitalised token (e.g. a name or place) closest to the date span
    # [date_start, date_end) in `text`. A token is a word of 4+ letters starting
    # with an uppercase letter, optionally followed by further capitalised words.
    token_reg = regex.finditer(r'[A-Z][a-zA-Z]{3,}(?: [A-Z][a-zA-Z]*){0,}', text)
    closest_token = None
    closest_distance = 100
    for token_it in token_reg:
        token = token_it.group(0)
        if token_it.start() == date_start:
            continue
        if token_it.end() < date_start and date_start - token_it.end() < closest_distance:
            # token ends before the date: measure from token end to date start
            closest_distance = date_start - token_it.end()
            closest_token = token
        elif token_it.start() - date_end < closest_distance:
            # otherwise measure from the end of the date span to the token start
            closest_distance = token_it.start() - date_end
            closest_token = token
    return closest_token
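

# Hedged illustration of find_token (added; not part of the original class).
# `self` is unused by the method, so None is passed for it; the sentence and
# date offsets are invented.
def _demo_find_token():
    text = "Albert Einstein published the theory in 1915 in Berlin Germany"
    date_start = text.index("1915")
    date_end = date_start + len("1915")
    print(find_token(None, text, date_start, date_end))  # -> 'Berlin Germany'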


def parse_text(self):
    # Strip <ref> tags, wikitable blocks, and {{cite ...}} templates from the wiki
    # text, cut the article off at the first trailing section (See also, Notes,
    # References, ...), then scan sentence-like spans for dates and build an
    # Index entry (token, date, surrounding context) for each hit.
    results = []
    self.text = regex.sub(r'<ref.*\n?.*</ref>', repl="", string=self.text)
    self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    self.text = regex.sub(r'{{[cC]ite.*}}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    # truncate the article at the first of these section headers that is present
    if self.paragraph_splitter(sep='== See also =='):
        pass
    elif self.paragraph_splitter(sep='==Notes=='):
        pass
    elif self.paragraph_splitter(sep='==References=='):
        pass
    elif self.paragraph_splitter(sep='== Bibliography =='):
        pass
    elif self.paragraph_splitter(sep='== External links =='):
        pass
    elif self.paragraph_splitter(sep='=== Sources ==='):
        pass
    sentences_reg = regex.finditer(
        r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
        self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance
    for sentence_it in sentences_reg:
        sentence = sentence_it.group(0)
        date_in_text = self.find_date(sentence)
        if date_in_text:
            # take up to 60 characters before and 30 after the date as context
            look_before = 60
            look_after = 30
            start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
            end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                sentence) else len(sentence)
            if date_in_text.end + look_after > len(sentence):
                token = self.find_token(sentence[start:], date_in_text.start,
                                        date_in_text.end)
            else:
                token = self.find_token(
                    sentence[start:date_in_text.end + look_after],
                    date_in_text.start, date_in_text.end)
            token_context = sentence[start:end]
            # extend the context so it starts at a full word
            i = start
            counter = 0
            while True:
                i -= 1
                counter += 1
                if i < 0 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context = sentence[i + 1:start] + token_context
                    break
            # extend the context so it ends at a full word
            i = end
            counter = 0
            while True:
                i += 1
                counter += 1
                if i > len(sentence) - 1 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context += sentence[end:end + counter]
                    break
            token_context = token_context.replace('\n', ' ')
            # note: the digit range here starts at 1, so '0' is stripped from the context
            token_context = regex.sub(r'[^a-zA-Z1-9.!?:%$ ]', '', token_context)
            token_context = token_context.strip()
            results.append(
                Index(token=token if token else self.title,
                      date=date_in_text.date,
                      info=token_context))
    return results


def parse_text(self):
    # Variant of parse_text above: the find_token call is disabled and the article
    # title is always used as the token for each Index entry.
    results = []
    self.text = regex.sub(r'<ref.*\n?.*</ref>', repl="", string=self.text)
    self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    self.text = regex.sub(r'{{[cC]ite.*}}',
                          repl="",
                          string=self.text,
                          flags=regex.DOTALL)
    if self.paragraph_splitter(sep='== See also =='):
        pass
    elif self.paragraph_splitter(sep='==Notes=='):
        pass
    elif self.paragraph_splitter(sep='==References=='):
        pass
    elif self.paragraph_splitter(sep='== Bibliography =='):
        pass
    elif self.paragraph_splitter(sep='== External links =='):
        pass
    elif self.paragraph_splitter(sep='=== Sources ==='):
        pass
    sentences_reg = regex.finditer(
        r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
        self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance
    for sentence_it in sentences_reg:
        sentence = sentence_it.group(0)
        date_in_text = self.find_date(sentence)
        if date_in_text:
            look_before = 60
            look_after = 30
            start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
            end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                sentence) else len(sentence)
            # if date_in_text.end + look_after > len(sentence):
            #     token = self.find_token(sentence[start:], date_in_text.start, date_in_text.end)
            # else:
            #     token = self.find_token(sentence[start:date_in_text.end + look_after], date_in_text.start, date_in_text.end)
            token_context = sentence[start:end]
            # extend the context so it starts at a full word
            i = start
            counter = 0
            while True:
                i -= 1
                counter += 1
                if i < 0 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context = sentence[i + 1:start] + token_context
                    break
            # extend the context so it ends at a full word
            i = end
            counter = 0
            while True:
                i += 1
                counter += 1
                if i > len(sentence) - 1 or counter > 8:
                    break
                if not (sentence[i].isalpha() or sentence[i].isdigit()):
                    token_context += sentence[end:end + counter]
                    break
            token_context = token_context.replace('\n', ' ')
            token_context = regex.sub(r'[^a-zA-Z0-9.!?:%$;, ]', '', token_context)
            token_context = token_context.strip()
            results.append(
                Index(token=self.title,
                      date=date_in_text.date,
                      info=token_context))
            # I couldn't find the best word that explains the purpose, and often the
            # result wasn't meaningful, so I decided not to use it.
            # tokenized = nltk.pos_tag(nltk.word_tokenize(sentence))
            #
            # proper_nouns = []
            # nouns = []
            # for (word, pos) in tokenized:
            #     if pos == 'NNP':
            #         proper_nouns.append(word)
            #     elif pos == 'NN':
            #         nouns.append(word)
            #
            # results.append(Index(token=proper_nouns[0] if proper_nouns else "title", date=date_in_text.date,
            #                      info=proper_nouns[1] if len(proper_nouns) > 1 else nouns[0] if nouns else ""))
    return results


from collections import defaultdict  # needed for the t-mer frequency counter below


def analyze_unit_structure(monomer_structure):
    # Given a list of (monomer_name, start, end) tuples, find higher-order repeat
    # units in the concatenated monomer string, map each occurrence back to
    # sequence coordinates, and split units that span a large gap between
    # consecutive monomers. MONOMER_GAP_SIZE, TMER_SIZE and get_units are
    # module-level definitions of the surrounding project.
    gaps = []
    unit_structure = []
    monomer_structure.sort(key=lambda x: x[1])
    # record positions where consecutive monomers are separated by a large gap
    for i in range(1, len(monomer_structure)):
        if abs(monomer_structure[i][1] - monomer_structure[i - 1][2]) > MONOMER_GAP_SIZE:
            gaps.append(
                (i, min(monomer_structure[i - 1][2], monomer_structure[i - 1][1]),
                 max(monomer_structure[i - 1][2], monomer_structure[i - 1][1]),
                 monomer_structure[i - 1][0]))
    monomers_str = ''.join([x[0] for x in monomer_structure])
    # count all substrings of length TMER_SIZE ("t-mers") of the monomer string
    tmers = defaultdict(int)
    for i in range(len(monomers_str) - TMER_SIZE + 1):
        tmers[monomers_str[i:i + TMER_SIZE]] += 1
    if not tmers:
        return unit_structure
    units = get_units(monomers_str, tmers)
    if not units:
        return unit_structure
    # map every occurrence of each unit back to sequence coordinates and mask it out
    for unit, occ in units:
        for unit_occ in regex.finditer(unit, monomers_str):
            unit_start, unit_end = unit_occ.span()
            unit_structure.append((monomer_structure[unit_start][1],
                                   monomer_structure[unit_end - 1][2], unit))
            monomers_str = monomers_str[:unit_start] + 'N' * (
                unit_end - unit_start) + monomers_str[unit_end:]
    unit_structure.sort(key=lambda x: x[0])
    prev_s, prev_e = 0, 0
    i = 0
    gaps = sorted(list(gaps))
    gaps_n = 0
    # split any unit that fully contains a gap into the parts before and after it
    while i < len(unit_structure):
        unit_start, unit_end, hor = unit_structure[i]
        while gaps_n < len(gaps) and unit_start > gaps[gaps_n][1]:
            gaps_n += 1
        if gaps_n < len(gaps) and unit_start < gaps[gaps_n][1] and gaps[
                gaps_n][2] < unit_end:
            #print(prev_e,unit_start,i,len(unit_structure),hor, gaps)
            #gaps.append((prev_e, unit_start))
            unit_structure.remove(unit_structure[i])
            if len(hor[:hor.index(gaps[gaps_n][3]) + 1]) > 1:
                unit_structure.insert(i, (unit_start, gaps[gaps_n][1],
                                          hor[:hor.index(gaps[gaps_n][3]) + 1]))
                i += 1
            if len(hor[hor.index(gaps[gaps_n][3]) + 1:]) > 1:
                unit_structure.insert(i + 1, (gaps[gaps_n][2], unit_end,
                                              hor[hor.index(gaps[gaps_n][3]) + 1:]))
                i += 1
        elif unit_start - prev_e > MONOMER_GAP_SIZE:
            pass
            #print("GAP", prev_e, unit_start)
            #gaps.append((prev_e, unit_start))
        else:
            i += 1
        prev_s, prev_e = unit_start, unit_end
    return unit_structure