def get_hashtags(text: str):
    # Find hashtag-like tokens: '#' followed by a word character that is not a digit.
    # Note: inside a character class '|' is a literal, so '[#|#]' also matches a leading '|'.
    hashtags = regex.findall(r'(?:[#|#])[^\d\W][\w]*', text)
    hashtags.sort(key=len)

    idx_hashtag_map = {}

    # Map each match's start offset to its hashtag; because tags are sorted by
    # length, a longer tag overwrites a shorter one found at the same offset.
    for hashtag in hashtags:
        indices = [
            m.start(0) for m in regex.finditer(regex.escape(hashtag), text)
        ]
        for idx in indices:
            idx_hashtag_map[idx] = hashtag

    return idx_hashtag_map.items()


def get_mentions(text: str):
    # Same approach for '@' mentions.
    mentions = regex.findall(r'(?:[@|@])[^\d\W][\w]*', text)
    mentions.sort(key=len)

    idx_mention_map = {}

    for mention in mentions:
        indices = [
            m.start(0) for m in regex.finditer(regex.escape(mention), text)
        ]
        for idx in indices:
            idx_mention_map[idx] = mention

    return idx_mention_map.items()
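
A quick usage sketch for the two helpers above (assumes the third-party regex module is imported as regex; the sample text is illustrative only):

text = "Loving #python and #regex; #python again, by @alice"
for idx, tag in sorted(get_hashtags(text)):
    print(idx, tag)              # 7 #python, 19 #regex, 27 #python
print(dict(get_mentions(text)))  # {45: '@alice'}
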
Example #3
def find_word(lines, expression):
    temp = []
    util = excelutil(mode='w')
    for line in lines:
        buff = ""
        for sentence in line[5].split('\n'):
            for word in regex.finditer(expression, sentence):
                # Drop the first two characters of the match before de-duplicating.
                if word.group(0)[2:] not in temp:
                    temp.append(word.group(0)[2:])
                buff += word.group(0) + "\n"
        util.write_nextline([line[0], line[5], buff], save=False)
    for x in temp:
        print(x)
    util.save()
Example #4
    def replaceAll(self, line, regex, substitution, matchgroup):
        quit = False
        output = line

        while not quit:
            startmatch = regex.finditer(output)
            quit = True

            for m in startmatch:
                if checkIfInsideString(m.start(matchgroup), self.strings):
                    continue
                # todo COMMENT CHECK

                # Replace the span of the requested group, then restart the scan,
                # since all offsets after the substitution point have shifted.
                output = output[:m.start(matchgroup)] + substitution + output[m.end(matchgroup):]
                quit = False
                break

        return output
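
A minimal call sketch, assuming an instance formatter of the class above whose self.strings records string-literal spans, together with the checkIfInsideString helper it relies on (the instance, pattern, and input line here are illustrative assumptions):

pattern = regex.compile(r'(\t+)')  # group 1: a run of tab characters
cleaned = formatter.replaceAll("if x:\tdo()", pattern, "    ", 1)
# every tab run outside string literals is replaced with four spaces
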
Example #5
    async def on_message(self, message):
        ctx = await self.bot.get_context(message)
        if (ctx.valid or not ctx.guild or message.webhook_id
                or ctx.author == self.bot.user):
            return

        results = regex.finditer(self.URL_REGEX, message.content)
        urls = [result.group(0) for result in results]

        if urls:
            confirm = await SimpleConfirm(
                message, emoji=UNICODE_EMOJI["INSPECT"]).prompt(ctx)
            if confirm:
                async with ctx.typing():
                    for url in urls:
                        try:
                            await self.show_media(ctx, url)
                        except commands.BadArgument as error:
                            await ctx.send(error)
Example #6
    def find_token(self, text, date_start, date_end):
        token_reg = regex.finditer(r'[A-Z][a-zA-Z]{3,}(?: [A-Z][a-zA-Z]*){0,}',
                                   text)

        closest_token = None
        closest_distance = 100
        for token_it in token_reg:
            token = token_it.group(0)

            # Skip a token that starts exactly where the date starts.
            if token_it.start() == date_start:
                continue

            # Token entirely before the date: measure from its end to the date start.
            if (token_it.end() < date_start
                    and date_start - token_it.end() < closest_distance):
                closest_distance = date_start - token_it.end()
                closest_token = token
            # Otherwise measure from the date end to the token start.
            elif token_it.start() - date_end < closest_distance:
                closest_distance = token_it.start() - date_end
                closest_token = token

        return closest_token
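
A minimal call sketch (assumes an instance parser of the surrounding class; the sample text and offsets are purely illustrative):

text = "The Battle of Hastings was fought on 14 October 1066 near the coast."
date_start = text.index("14 October 1066")
date_end = date_start + len("14 October 1066")
nearest = parser.find_token(text, date_start, date_end)  # nearest capitalized token, or None
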
Example #7
    def parse_text(self):
        results = []
        self.text = regex.sub(r'<ref.*\n?.*</ref>',
                              repl="",
                              string=self.text)
        self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)
        self.text = regex.sub(r'{{[cC]ite.*}}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)

        # Try these section markers in order and stop at the first one
        # that paragraph_splitter handles.
        if self.paragraph_splitter(sep='== See also =='):
            pass
        elif self.paragraph_splitter(sep='==Notes=='):
            pass
        elif self.paragraph_splitter(sep='==References=='):
            pass
        elif self.paragraph_splitter(sep='== Bibliography =='):
            pass
        elif self.paragraph_splitter(sep='== External links =='):
            pass
        elif self.paragraph_splitter(sep='=== Sources ==='):
            pass

        sentences_reg = regex.finditer(
            r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
            self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance

        for sentence_it in sentences_reg:
            sentence = sentence_it.group(0)
            date_in_text = self.find_date(sentence)
            if date_in_text:
                look_before = 60
                look_after = 30
                start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
                end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                    sentence) else len(sentence)
                if date_in_text.end + look_after > len(sentence):
                    token = self.find_token(sentence[start:],
                                            date_in_text.start,
                                            date_in_text.end)
                else:
                    token = self.find_token(
                        sentence[start:date_in_text.end + look_after],
                        date_in_text.start, date_in_text.end)

                token_context = sentence[start:end]

                # token with full word at beginning
                i = start
                counter = 0
                while True:
                    i -= 1
                    counter += 1
                    if i < 0 or counter > 8:
                        break

                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context = sentence[i + 1:start] + token_context
                        break

                # token with full word at end
                i = end
                counter = 0
                while True:
                    i += 1
                    counter += 1
                    if i > len(sentence) - 1 or counter > 8:
                        break
                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context += sentence[end:end + counter]
                        break

                token_context = token_context.replace('\n', ' ')
                token_context = regex.sub(r'[^a-zA-Z0-9.!?:%$ ]', '',
                                          token_context)
                token_context = token_context.strip()

                results.append(
                    Index(token=token if token else self.title,
                          date=date_in_text.date,
                          info=token_context))

        return results
Example #8
    def parse_text(self):
        results = []
        self.text = regex.sub(r'<ref.*\n?.*</ref>', repl="", string=self.text)
        self.text = regex.sub(r'{\| class=\"wikitable.*\|}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)
        self.text = regex.sub(r'{{[cC]ite.*}}',
                              repl="",
                              string=self.text,
                              flags=regex.DOTALL)

        if self.paragraph_splitter(sep='== See also =='):
            pass
        elif self.paragraph_splitter(sep='==Notes=='):
            pass
        elif self.paragraph_splitter(sep='==References=='):
            pass
        elif self.paragraph_splitter(sep='== Bibliography =='):
            pass
        elif self.paragraph_splitter(sep='== External links =='):
            pass
        elif self.paragraph_splitter(sep='=== Sources ==='):
            pass

        sentences_reg = regex.finditer(
            r'(^| )[A-Z][^\.!?]{5,}[\.!?]',
            self.text)  # possibly [A-Z][^\.!?]{5,}[\.!?] for performance

        for sentence_it in sentences_reg:
            sentence = sentence_it.group(0)
            date_in_text = self.find_date(sentence)
            if date_in_text:
                look_before = 60
                look_after = 30
                start = date_in_text.start - look_before if date_in_text.start >= look_before else 0
                end = date_in_text.end + look_after if date_in_text.end + look_after < len(
                    sentence) else len(sentence)
                # if date_in_text.end + look_after > len(sentence):
                #     token = self.find_token(sentence[start:], date_in_text.start, date_in_text.end)
                # else:
                #     token = self.find_token(sentence[start:date_in_text.end + look_after], date_in_text.start, date_in_text.end)

                token_context = sentence[start:end]

                # token with full word at beginning
                i = start
                counter = 0
                while True:
                    i -= 1
                    counter += 1
                    if i < 0 or counter > 8:
                        break

                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context = sentence[i + 1:start] + token_context
                        break

                # token with full word at end
                i = end
                counter = 0
                while True:
                    i += 1
                    counter += 1
                    if i > len(sentence) - 1 or counter > 8:
                        break
                    if not (sentence[i].isalpha() or sentence[i].isdigit()):
                        token_context += sentence[end:end + counter]
                        break

                token_context = token_context.replace('\n', ' ')
                token_context = regex.sub(r'[^a-zA-Z0-9.!?:%$;, ]', '',
                                          token_context)
                token_context = token_context.strip()

                results.append(
                    Index(token=self.title,
                          date=date_in_text.date,
                          info=token_context))

                #  I couldn't find a token that best explains the purpose, and the result
                #  was rarely meaningful, so I decided not to use this approach.

                # tokenized = nltk.pos_tag(nltk.word_tokenize(sentence))
                #
                # proper_nouns = []
                # nouns = []
                # for (word, pos) in tokenized:
                #     if pos == 'NNP':
                #         proper_nouns.append(word)
                #     elif pos == 'NN':
                #         nouns.append(word)
                #
                # results.append(Index(token=proper_nouns[0] if proper_nouns else "title", date=date_in_text.date, info=proper_nouns[1] if
                # len(proper_nouns) > 1 else nouns[0] if nouns else ""))

        return results
Example #9
def analyze_unit_structure(monomer_structure):
    gaps = []
    unit_structure = []
    monomer_structure.sort(key=lambda x: x[1])
    # Record positions where consecutive monomers are separated by more than MONOMER_GAP_SIZE.
    for i in range(1, len(monomer_structure)):
        if abs(monomer_structure[i][1] -
               monomer_structure[i - 1][2]) > MONOMER_GAP_SIZE:
            gaps.append(
                (i,
                 min(monomer_structure[i - 1][2], monomer_structure[i - 1][1]),
                 max(monomer_structure[i - 1][2], monomer_structure[i - 1][1]),
                 monomer_structure[i - 1][0]))
    monomers_str = ''.join([x[0] for x in monomer_structure])
    # Count every substring of length TMER_SIZE (t-mer) of the monomer string.
    tmers = defaultdict(int)
    for i in range(len(monomers_str) - TMER_SIZE + 1):
        tmers[monomers_str[i:i + TMER_SIZE]] += 1
    if not tmers:
        return unit_structure

    units = get_units(monomers_str, tmers)
    if not units:
        return unit_structure
    for unit, occ in units:
        for unit_occ in regex.finditer(unit, monomers_str):
            unit_start, unit_end = unit_occ.span()
            unit_structure.append((monomer_structure[unit_start][1],
                                   monomer_structure[unit_end - 1][2], unit))
            # Mask the matched region with 'N' so later units cannot match it again.
            monomers_str = monomers_str[:unit_start] + 'N' * (
                unit_end - unit_start) + monomers_str[unit_end:]
    unit_structure.sort(key=lambda x: x[0])
    prev_s, prev_e = 0, 0
    i = 0
    gaps = sorted(gaps)
    gaps_n = 0
    while i < len(unit_structure):
        unit_start, unit_end, hor = unit_structure[i]
        while gaps_n < len(gaps) and unit_start > gaps[gaps_n][1]:
            gaps_n += 1
        if gaps_n < len(gaps) and unit_start < gaps[gaps_n][1] and gaps[
                gaps_n][2] < unit_end:
            #print(prev_e,unit_start,i,len(unit_structure),hor, gaps)
            #gaps.append((prev_e, unit_start))
            unit_structure.pop(i)
            if len(hor[:hor.index(gaps[gaps_n][3]) + 1]) > 1:
                unit_structure.insert(i,
                                      (unit_start, gaps[gaps_n][1],
                                       hor[:hor.index(gaps[gaps_n][3]) + 1]))
                i += 1
            if len(hor[hor.index(gaps[gaps_n][3]) + 1:]) > 1:
                unit_structure.insert(i + 1,
                                      (gaps[gaps_n][2], unit_end,
                                       hor[hor.index(gaps[gaps_n][3]) + 1:]))
                i += 1
        elif unit_start - prev_e > MONOMER_GAP_SIZE:
            pass
            #print("GAP", prev_e, unit_start)
            #gaps.append((prev_e, unit_start))
        else:
            i += 1
        prev_s, prev_e = unit_start, unit_end
    return unit_structure