def test_re_escape(self): p = "" # This had to change from the original test of range(0,256) # because we can't support non-ascii non-utf8 strings for i in range(0, 128): p = p + chr(i) self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None, True) self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1)) pat = re.compile(re.escape(p)) self.assertEqual(pat.match(p) is not None, True) self.assertEqual(pat.match(p).span(), (0,128))
def test_re_escape(self): p="" # This had to change from the original test of range(0,256) # because we can't support non-ascii non-utf8 strings for i in range(0, 128): p = p + chr(i) self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None, True) self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1)) pat=re.compile(re.escape(p)) self.assertEqual(pat.match(p) is not None, True) self.assertEqual(pat.match(p).span(), (0,128))
def get_pattern(self, subject, modifiers):
    # cast to lists, so we're not splitting a single string
    if not isinstance(getattr(self, subject), list):
        setattr(self, subject, [getattr(self, subject)])
    if not isinstance(modifiers, list):
        modifiers = list(modifiers.split(' '))

    # cast all elements to strings in case of any numbers
    values = [unicode(val) for val in getattr(self, subject)]
    if 'regex' not in modifiers:
        values = [re.escape(val) for val in values]
    value_str = u'({0})'.format('|'.join(values))

    # check if they defined a match modifier
    for mod in self._match_modifiers:
        if mod in modifiers:
            match_mod = mod
            break
    else:
        subject = self.trimmed_key(subject)
        # handle subdomains for domain checks
        if subject == 'domain':
            value_str = ur'(?:.*?\.)?' + value_str
        match_mod = self._modifier_defaults.get(subject, 'includes-word')

    return self._match_modifiers[match_mod].format(value_str)
def test_basic_re_sub(self): self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), '9.3 -3 24x100y') self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), '9.3 -3 23x99y') self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') self.assertEqual(re.sub('.', r"\n", 'x'), '\n') s = r"\1\1" self.assertEqual(re.sub('(.)', s, 'x'), 'xx') self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s) self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s) self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx') self.assertEqual( re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a'), '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') self.assertEqual( re.sub('a', '\t\n\v\r\f\a', 'a'), (chr(9) + chr(10) + chr(11) + chr(13) + chr(12) + chr(7))) self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
def test_basic_re_sub(self): self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), '9.3 -3 24x100y') self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), '9.3 -3 23x99y') self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') self.assertEqual(re.sub('.', r"\n", 'x'), '\n') s = r"\1\1" self.assertEqual(re.sub('(.)', s, 'x'), 'xx') self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s) self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s) self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx') self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'), '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))) self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
def test_basic_re_sub(self): self.assertEqual(re.sub(b"(?i)b+", b"x", b"bbbb BBBB"), b'x x') self.assertEqual(re.sub(b'\\d+', self.bump_num, b'08.2 -2 23x99y'), b'9.3 -3 24x100y') self.assertEqual(re.sub(b'\\d+', self.bump_num, b'08.2 -2 23x99y', 3), b'9.3 -3 23x99y') self.assertEqual(re.sub(b'.', lambda m: b"\\n", b'x'), b'\\n') self.assertEqual(re.sub(b'.', b"\\n", b'x'), b'\n') s = b"\\1\\1" self.assertEqual(re.sub(b'(.)', s, b'x'), b'xx') self.assertEqual(re.sub(b'(.)', re.escape(s), b'x'), s) self.assertEqual(re.sub(b'(.)', lambda m: s, b'x'), s) self.assertEqual(re.sub(b'(?P<a>x)', b'\g<a>\g<a>', b'xx'), b'xxxx') self.assertEqual(re.sub(b'(?P<a>x)', b'\g<a>\g<1>', b'xx'), b'xxxx') self.assertEqual(re.sub(b'(?P<unk>x)', b'\g<unk>\g<unk>', b'xx'), b'xxxx') self.assertEqual(re.sub(b'(?P<unk>x)', b'\g<1>\g<1>', b'xx'), b'xxxx') self.assertEqual( re.sub(b'a', b'\\t\\n\\v\\r\\f\\a\\b\\B\\Z\\a\\A\\w\\W\\s\\S\\d\\D', b'a'), b'\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') self.assertEqual(re.sub(b'a', b'\t\n\v\r\f\a', b'a'), b'\t\n\v\r\f\a') self.assertEqual(re.sub(b'a', b'\t\n\v\r\f\a', b'a'), (chr(9) + chr(10) + chr(11) + chr(13) + chr(12) + chr(7)).encode('utf-8')) self.assertEqual(re.sub(b'^\s*', b'X', b'test'), b'Xtest')
def keyword_stem_regex(tokenizer, keyword, name=None):
    keyword_re = []
    word_tokens = word_segs(tokenizer, keyword.lower())
    for word in word_tokens:
        allow_ending = (
            all((tok.isalpha() for tok in word))
            and len(word) >= 2
        )
        letters_stemmed = len("".join((w.decode("utf-8") for w in word[:-1])))
        do_stem = allow_ending and len(word) >= 3 and letters_stemmed >= 4
        if do_stem:
            del word[-1]
        joined = b"".join(word)
        keyword_re.append(
            re2.escape(joined) + (br"\pL*" if allow_ending else b"")
        )
    if not keyword_re:
        return None
    if name is not None:
        capture_group = b"?P<%s>" % name.encode("utf-8")
    else:
        capture_group = b""
    return (
        br"(" + capture_group + br"\s+".join(keyword_re) + br")"
    )
def fit(self):
    title_regs = []
    for ne in tqdm(self.named_entities, desc="fit ner"):
        for title in ne.get_titles(lang="en", with_disambiguation=False):
            title_regs += [
                re.escape(expansion)
                for expansion in TextNormalizer.get_rabbi_expansions(title)
            ]
    title_regs.sort(key=lambda x: len(x), reverse=True)
    word_breakers = r"|".join(
        re.escape(breaker) for breaker in [
            '.', ',', '"', '?', '!', '(', ')', '[', ']', '{', '}',
            ':', ';', '§', '<', '>', "'s"
        ])
    self.named_entity_regex = re.compile(
        fr"(?:^|\s|{word_breakers})({'|'.join(title_regs)})(?:\s|{word_breakers}|$)"
    )
def get_rabbi_regex(cls, rabbi):
    reg = rabbi.replace(
        cls.b_token,
        f"(?:{u'|'.join(re.escape(b) for b in cls.b_replacements)})")
    for starter in cls.starting_replacements:
        starter = re.escape(starter)
        reg = re.sub(f'^{starter}', f"(?:{starter.lower()}|{starter})", reg)
    return reg
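A self-contained usage sketch of the snippet above; the class attributes below (b_token, b_replacements, starting_replacements) are invented placeholders, not values from the original code.

import re


class _DemoRabbiPatterns:
    # Hypothetical values -- the real class defines its own token and lists.
    b_token = "{B}"
    b_replacements = ["b.", "ben", "bar"]
    starting_replacements = ["Rabbi", "Rav"]

    @classmethod
    def get_rabbi_regex(cls, rabbi):
        # Same body as the snippet above: expand the placeholder token into an
        # alternation of escaped variants, then let a leading title match in
        # lower or original case.
        reg = rabbi.replace(
            cls.b_token,
            f"(?:{u'|'.join(re.escape(b) for b in cls.b_replacements)})")
        for starter in cls.starting_replacements:
            starter = re.escape(starter)
            reg = re.sub(f'^{starter}', f"(?:{starter.lower()}|{starter})", reg)
        return reg


print(_DemoRabbiPatterns.get_rabbi_regex("Rabbi Akiva {B} Yosef"))
# -> (?:rabbi|Rabbi) Akiva (?:b\.|ben|bar) Yosef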
def run(self):
    ip_indicators = [
        "204.93.183.196",
        "50.31.146.109",
        "5.135.208.53",
        "103.25.59.120",
        "50.97.99.2",
        "173.203.112.215",
        "27.124.127.10",
        "78.129.181.191",
        "204.197.254.94",
        "50.31.146.134",
    ]

    match_file = self.check_file(
        pattern=".*\\\\Application\\ Data\\\\Microsoft\\\\[a-z]{3}(api32|audio|bios|boot|cap32|common|config|crypt|edit32|error|mgr32|serial|setup|share|sock|system|update|video|windows)\.exe$",
        regex=True, all=True)
    match_batch_file = self.check_file(
        pattern=".*\\\\Application\\ Data\\\\\d{1,10}\.bat$",
        regex=True, all=True)
    match_runkey = self.check_key(
        pattern=".*\\\\Microsoft\\\\Windows\\\\CurrentVersion\\\\Run\\\\[a-z]{3}(api32|audio|bios|boot|cap32|common|config|crypt|edit32|error|mgr32|serial|setup|share|sock|system|update|video|windows)\.exe$",
        regex=True, all=True)
    match_otherkey = self.check_key(
        pattern=".*\\\\Microsoft\\\\Office\\\\Common\\\\(?P<hex>[A-F0-9]+)\\\\(?P=hex)(CS|PS|SS|RS)",
        regex=True, all=True)
    match_mutex = self.check_mutex(pattern="^[A-F0-9]{1,8}(I|M|RM)$",
                                   regex=True, all=True)

    found_match_ip = False
    found_match_url = False

    if match_file:
        for match in match_file:
            self.data.append({"file": match})
    if match_batch_file:
        for match in match_batch_file:
            self.data.append({"batchfile": match})
    if match_runkey:
        for match in match_runkey:
            self.data.append({"runkey": match})
    if match_otherkey:
        for match in match_otherkey:
            self.data.append({"otherkey": match})
    if match_mutex:
        for match in match_mutex:
            self.data.append({"mutex": match})

    for ip_indicator in ip_indicators:
        match_ip = self.check_ip(pattern=ip_indicator)
        if match_ip:
            self.data.append({"ip": match_ip})
            found_match_ip = True
        match_url = self.check_url(
            pattern="http://" + re.escape(ip_indicator) +
                    ":8080/[a-f0-9]{1,8}/[a-f0-9]{1,8}/",
            regex=True, all=True)
        if match_url:
            for match in match_url:
                self.data.append({"url": match})
                found_match_url = True

    if match_file or match_batch_file or match_mutex or found_match_ip or found_match_url or match_runkey or match_otherkey:
        return True

    return False
def handleEvent(self, event):
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    self.sf.debug("Received event, " + eventName + ", from " + srcModuleName)

    # Don't look up stuff twice
    if self.results.has_key(eventData):
        self.sf.debug("Skipping " + eventData + " as already mapped.")
        return None
    else:
        self.results[eventData] = True

    data = self.query(eventData)
    if data == None:
        return None

    for n in data:
        e = SpiderFootEvent("LEAKSITE_URL", n, self.__name__, event)
        self.notifyListeners(e)

        res = self.sf.fetchUrl(n, timeout=self.opts['_fetchtimeout'],
                               useragent=self.opts['_useragent'])

        if res['content'] is None:
            self.sf.debug("Ignoring " + n + " as no data returned")
            continue

        # Sometimes pastes search results false positives
        if re.search("[^a-zA-Z\-\_0-9]" + re.escape(eventData) +
                     "[^a-zA-Z\-\_0-9]", res['content'],
                     re.IGNORECASE) is None:
            continue

        try:
            startIndex = res['content'].index(eventData)
        except BaseException as e:
            self.sf.debug("String not found in pastes content.")
            continue

        evt = SpiderFootEvent("LEAKSITE_CONTENT", res['content'],
                              self.__name__, e)
        self.notifyListeners(evt)
def _add(self, message: IRCMessage):
    """add - Adds a quote to the OutOfContext log. The quote will be pulled from a message line buffer."""
    if len(message.parameterList) < 2:
        return IRCResponse(ResponseType.Say, "Add what?", message.replyTo)
    if message.targetType == TargetTypes.USER:
        return IRCResponse(ResponseType.Say,
                           "You can only add messages from channels.",
                           message.replyTo)

    regex = re2.compile(re2.escape(" ".join(message.parameterList[1:])),
                        re2.IGNORECASE)
    if len(self.messageStore) == 0 or message.channel not in self.messageStore:
        return IRCResponse(ResponseType.Say,
                           "Sorry, there are no messages in my buffer.",
                           message.replyTo)

    matches = list(filter(regex.search, self.messageStore[message.channel]))
    if len(matches) == 0:
        return IRCResponse(
            ResponseType.Say,
            "Sorry, that didn't match anything in my message buffer.",
            message.replyTo)
    if len(matches) > 1:
        return IRCResponse(
            ResponseType.Say,
            "Sorry, that matches too many lines in my message buffer.",
            message.replyTo)

    todayDate = time.strftime("[%Y-%m-%d] [%H:%M]")
    quote = f"{todayDate} {matches[0]}"
    if message.replyTo not in self.storage:
        self.storage[message.replyTo] = []
    if len(self.storage[message.replyTo]) > 0 and self.storage[message.replyTo][-1] == quote:
        return IRCResponse(
            ResponseType.Say,
            "That quote has already been added to the log!",
            message.replyTo)
    else:
        self.storage[message.replyTo].append(quote)
        return IRCResponse(ResponseType.Say,
                           f"Quote '{quote}' was added to the log!",
                           message.replyTo)
def run(): ''' #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ start = time() # Browser br = mechanize.Browser() # Cookie Jar cj = cookielib.LWPCookieJar() br.set_cookiejar(cj) # Browser options br.set_handle_equiv(True) #br.set_handle_gzip(True) br.set_handle_redirect(True) br.set_handle_referer(True) br.set_handle_robots(False) # Follows refresh 0 but not hangs on refresh > 0 br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) # Want debugging messages? #br.set_debug_http(True) #br.set_debug_redirects(True) #br.set_debug_responses(True) # User-Agent br.addheaders = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')] print "initiated browser: " + str(time()-start) + " seconds" #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # volume/page of JAMA review articles from 2000/01 to 2013/04/1 vol_pg_tuples = [('309', '1278'), ('309', '1163'), ('309', '926'), ('309', '919'), ('309', '814'), ('309', '706'), ('309', '678'), ('309', '594'), ('308', '2507'), ('309', '71'), ('308', '2612'), ('308', '1024'), ('308', '502'), ('307', '2526'), ('307', '2079'), ('307', '2418'), ('307', '1959'), ('307', '1185'), ('307', '1072'), ('307', '713'), ('307', '294'), ('307', '182'), ('306', '2704'), ('306', '2011'), ('306', '1782'), ('306', '1688'), ('306', '1359'), ('306', '1241'), ('306', '978'), ('306', '746'), ('306', '627'), ('306', '420'), ('305', '2335'), ('305', '1790'), ('305', '1327'), ('305', '1225'), ('305', '1119'), ('305', '1008'), ('305', '698'), ('305', '487'), ('305', '284'), ('305', '78'), ('304', '2628'), ('304', '2161'), ('304', '2048'), ('304', '1592'), ('304', '890'), ('304', '779'), ('304', '452'), ('304', '321'), ('304', '76'), ('303', '2280'), ('303', '1848'), ('303', '1738'), ('303', '1729'), ('303', '1526'), ('303', '1295'), ('303', '1180'), ('303', '1077'), ('303', '865'), ('303', '438'), ('303', '47'), ('302', '2679'), ('302', '2345'), ('302', '2243'), ('302', '2135'), ('302', '1316'), ('302', '985'), ('302', '550'), ('302', '537'), ('302', '412'), ('302', '179'), ('301', '2472'), ('301', '2362'), ('301', '2349'), ('302', '73'), ('301', '2129'), ('301', '1358'), ('301', '636'), ('301', '954'), ('301', '415'), ('301', '309'), ('300', '2886'), ('300', '2779'), ('300', '2754'), ('300', '2647'), ('300', '2638'), ('300', '2514'), ('301', '82'), ('300', '2407'), ('300', '2286'), ('300', '2277'), ('300', '2161'), ('300', '1793'), ('300', '1674'), ('300', '2036'), ('300', '1439'), ('300', '1181'), ('300', '711'), ('300', '555'), ('300', '197'), ('299', '2777'), ('299', '2423'), ('299', '1937'), ('299', '1698'), ('299', '1446'), ('299', '1320'), ('299', '1166'), ('299', '937'), ('299', '925'), ('299', '914'), ('299', '806'), ('299', '793'), ('299', '672'), ('299', '324'), ('299', '555'), ('298', '2895'), ('298', '2654'), ('298', '2296'), ('298', '2171'), ('298', '1911'), ('298', '1900'), ('298', '1429'), ('298', '1312'), ('298', '1300'), ('298', '1038'), ('298', '1023'), ('298', '902'), ('298', '786'), ('298', '655'), ('298', '438'), ('298', '194'), ('298', '70'), ('298', '61'), ('297', '2741'), ('297', '2617'), ('297', '2603'), ('297', '2502'), ('297', '2391'), ('297', '2381'), ('297', '2264'), ('297', '2251'), ('297', '2241'), ('297', '2018'), ('297', '1810'), ('297', '1697'), ('297', '1583'), ('297', '1551'), ('297', '1478'), ('297', '1241'), ('297', '1233'), ('297', '986'), ('297', 
'842'), ('297', '831'), ('297', '733'), ('297', '724'), ('297', '77'), ('296', '2839'), ('296', '2558'), ('296', '2234'), ('296', '2012'), ('296', '1885'), ('296', '1764'), ('296', '1731'), ('296', '1507'), ('296', '1377'), ('296', '1274'), ('296', '1619'), ('296', '1633'), ('296', '1116'), ('296', '1103'), ('296', '1094'), ('296', '974'), ('296', '815'), ('296', '679'), ('296', '445'), ('296', '427'), ('295', '2765'), ('295', '2286'), ('295', '2275'), ('295', '2057'), ('295', '1824'), ('295', '1688'), ('295', '1566'), ('295', '1288'), ('295', '1050'), ('295', '809'), ('295', '547'), ('295', '536'), ('295', '416'), ('295', '403'), ('295', '199'), ('294', '3124'), ('294', '2889'), ('294', '2751'), ('294', '2623'), ('294', '2342'), ('294', '2203'), ('294', '2064'), ('294', '1944'), ('287', '2784'), ('284', '1417'), ('287', '1301'), ('289', '3161'), ('289', '1976'), ('291', '2865'), ('294', '947'), ('289', '217'), ('285', '2498'), ('288', '2793'), ('289', '331'), ('285', '1819'), ('291', '2013'), ('293', '3043'), ('293', '1509'), ('292', '972'), ('289', '1837'), ('289', '2992'), ('283', '2568'), ('286', '1610'), ('292', '726'), ('292', '1593'), ('287', '2701'), ('288', '2151'), ('284', '2919'), ('289', '3145'), ('287', '2335'), ('290', '1001'), ('294', '725'), ('289', '747'), ('293', '730'), ('283', '1451'), ('284', '1820'), ('285', '1415'), ('287', '2570'), ('285', '1613'), ('287', '2869'), ('284', '2785'), ('290', '1360'), ('285', '3065'), ('293', '2391'), ('291', '2367'), ('288', '1388'), ('293', '1906'), ('284', '215'), ('293', '1089'), ('287', '1233'), ('286', '208'), ('291', '870'), ('284', '934'), ('290', '248'), ('291', '358'), ('287', '1840'), ('293', '855'), ('292', '1989'), ('294', '97'), ('285', '193'), ('288', '1116'), ('292', '2890'), ('293', '90'), ('289', '1288'), ('291', '1610'), ('290', '2599'), ('287', '1502'), ('294', '1088'), ('289', '1681'), ('292', '1480'), ('288', '2579'), ('293', '2372'), ('288', '611'), ('291', '99'), ('286', '2516'), ('291', '986'), ('290', '86'), ('283', '381'), ('285', '2763'), ('287', '487'), ('287', '883'), ('283', '3110'), ('287', '1308'), ('293', '596'), ('292', '1602'), ('293', '1245'), ('293', '2012'), ('293', '1644'), ('286', '1360'), ('288', '1889'), ('291', '228'), ('286', '2787'), ('285', '1489'), ('287', '226'), ('294', '1534'), ('292', '852'), ('286', '1218'), ('288', '3137'), ('290', '2464'), ('288', '2233'), ('291', '2359'), ('289', '2475'), ('293', '979'), ('287', '1848'), ('290', '524'), ('293', '1653'), ('290', '932'), ('283', '1469'), ('292', '2755'), ('286', '2308'), ('287', '622'), ('291', '1999'), ('287', '2414'), ('287', '1022'), ('285', '1059'), ('293', '2141'), ('287', '425'), ('289', '2254'), ('291', '1887'), ('293', '987'), ('287', '2691'), ('286', '2143'), ('289', '2857'), ('293', '1223'), ('292', '367'), ('288', '932'), ('285', '1338'), ('285', '2891'), ('294', '238'), ('293', '1501'), ('292', '1724'), ('286', '895'), ('293', '477'), ('290', '1767'), ('292', '1867'), ('292', '2901'), ('290', '659'), ('291', '2746'), ('289', '589'), ('289', '347'), ('286', '341'), ('291', '605'), ('287', '1972'), ('283', '2008'), ('283', '3244'), ('289', '210'), ('288', '2868'), ('286', '2000'), ('293', '2641'), ('288', '2569'), ('291', '1127'), ('284', '412'), ('292', '2880'), ('286', '2296'), ('286', '3056'), ('288', '2167'), ('288', '872'), ('285', '1193'), ('285', '992'), ('289', '2413'), ('287', '1435'), ('285', '2055'), ('292', '97'), ('286', '1149'), ('292', '1074'), ('291', '1238'), ('291', '1368'), ('290', '2849'), ('290', 
'2057'), ('288', '2458'), ('285', '2232'), ('286', '442'), ('288', '629'), ('290', '2455'), ('288', '1901'), ('287', '2114'), ('288', '2724'), ('289', '80'), ('284', '1689'), ('289', '3300'), ('292', '2874'), ('291', '2243'), ('292', '89'), ('287', '92'), ('293', '1367'), ('289', '2545'), ('290', '1633'), ('287', '762'), ('288', '2449'), ('292', '2771'), ('290', '2301'), ('290', '1510'), ('285', '1186'), ('283', '3102'), ('285', '785'), ('291', '736'), ('292', '237'), ('292', '2622'), ('290', '1906'), ('289', '2041'), ('285', '1987'), ('289', '2120'), ('290', '2476'), ('284', '1549'), ('294', '1671'), ('286', '2270'), ('287', '2391'), ('283', '2281'), ('286', '2981'), ('293', '2257'), ('287', '360'), ('283', '1800'), ('286', '2441'), ('289', '2849'), ('287', '2120'), ('289', '895'), ('292', '490'), ('288', '1622'), ('293', '217'), ('287', '236'), ('291', '350'), ('291', '1487'), ('287', '2917'), ('286', '944'), ('286', '821'), ('288', '745'), ('288', '222'), ('287', '2236'), ('293', '349'), ('292', '2388'), ('287', '628'), ('285', '386'), ('287', '2821'), ('284', '1828'), ('286', '954'), ('291', '1763'), ('292', '3017'), ('288', '351'), ('289', '454'), ('288', '1610'), ('287', '3116'), ('290', '719')] for count, vol_pg_tuple in enumerate(vol_pg_tuples): url = 'http://jama.jamanetwork.com/article.aspx?volume=%s&page=%s' % vol_pg_tuple try: sys.stdout.write("article # " + str(count) + " reading url...") start = time() r = br.open(url) entry_url = r.geturl() entry_html_source = r.read() soup = BeautifulSoup(entry_html_source.decode('utf-8'), 'html5lib') is_free = soup.find(class_='freeArticle') if is_free is None: sys.stdout.write(str(time()-start) + " seconds") sys.stdout.write("...skipping, article not free.\n") sys.stdout.flush() else: sys.stdout.write("adding to database...") # format of returned list from get_metadata function: # 0 identifier # 1 type # 2 language # 3 title # 4 date # 5 publisher # 6 author # 7 journal # 8 volume # 9 issue # 10 firstpage # 11 lastpage # 12 url res_metadata = parser.get_metadata(entry_url, entry_html_source) res_metadata[1] = 'JAMA review articles' res_identifier = res_metadata[0] # creates new Resource object and containing Subresource objects # creates Resource based on returned parser metadata res = Resource(identifier = res_metadata[0], type = res_metadata[1], language = res_metadata[2], title = res_metadata[3], date = res_metadata[4], publisher = res_metadata[5], author = res_metadata[6], journal = res_metadata[7], volume = res_metadata[8], issue = res_metadata[9], firstpage = res_metadata[10], lastpage = res_metadata[11], url = entry_url, html_source = entry_html_source) res.save() res.user.add(9) # corresponds to [email protected] #res.user.add(2) # corresponds to [email protected] res.domain.add(1) # corresponds to Biomedical subres = [] # creates Subresource objects of type 'figure' figures = parser.get_figures(entry_url, entry_html_source) for i, figure in enumerate(figures): subres.append(Subresource(containing_resource = res, name = figure[0].split('. ')[0], type = 'figure', content = u'. '.join(figure[0].split('. ')[1:]) + u'. 
' + figure[1], url = figure[4])) # creates Subresource objects of type 'paragraph' paragraphs = parser.get_paragraphs(entry_url, entry_html_source) for i, paragraph in enumerate(paragraphs): subres.append(Subresource(containing_resource = res, name = 'paragraph ' + str(i), type = 'paragraph', content = paragraph)) subres_temp = Subresource.objects.bulk_create(subres) del subres_temp del subres sys.stdout.write(str(time()-start) + " seconds\n") sys.stdout.flush() except Exception, e: print "failed. exception: "+str(e) traceback.print_exc() ''' #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sys.stdout.write("~~~~loading concepts and term lists... ") start = time() file = open('scripts\MESH_concept_and_terms_tuple.pkl', 'rb') (tot_concepts, concept_IDs, term_lists) = pickle_zloads(file.read()) file.close() sys.stdout.write("%.2f" % (time() - start) + "seconds\n") sys.stdout.flush() res_ids = list( Resource.objects.filter(type="JAMA review articles").values_list( 'id', flat=True)) print "total # of resources: " + str(len(res_ids)) for count, res_id in enumerate(res_ids): try: sys.stdout.write("article # " + str(count) + " processing...") start = time() target_paragraphs = Subresource.objects.filter( containing_resource_id=res_id) #create sentences from target_paragraphs sentences = [] sentences_indexofparagraph = [] tot_para = 0 tot_sent = 0 for para_num, target_paragraph in enumerate(target_paragraphs): #find all sentence in this paragraph tokenized_sentences = sent_tokenize( target_paragraph.content.rstrip()) sentences.extend(tokenized_sentences) sentences_indexofparagraph.extend([para_num] * len(tokenized_sentences)) tot_sent = tot_sent + len(tokenized_sentences) tot_para = tot_para + 1 tot_para = len(target_paragraphs) #second go through each concept/term, find them in subresources, and process into matrix tc = 0 j = 0 row_sentence = [] row_paragraph = [] col_sentence = [] col_paragraph = [] data_sentence = [] data_paragraph = [] # initialize list of empty lists for storing concepts contained in each paragraph para_conceptIDs_contained = [[] for i in range(tot_para)] for i, con_ID in enumerate(concept_IDs): term_list = term_lists[i] wordcount_in_paragraphs = [0] * tot_para terms_regex = [ r"\b" + re2.escape(term.lower()) + r"\b" for term in term_list ] search_pattern = re2.compile("|".join(terms_regex)) for sent_num, sentence in enumerate(sentences): wordcount = len(search_pattern.findall(sentence.lower())) if wordcount > 0: #only go ahead if search_pattern is in the sentence row_sentence.append(sent_num) col_sentence.append(tc) data_sentence.append(1) wordcount_in_paragraphs[ sentences_indexofparagraph[sent_num]] += wordcount for para_num in range(tot_para): wordcount_in_p = wordcount_in_paragraphs[para_num] if wordcount_in_p > 0: row_paragraph.append(para_num) col_paragraph.append(tc) data_paragraph.append(1) para_conceptIDs_contained[para_num].append(con_ID) if tc * 10 / tot_concepts > j: percent_done = tc * 10 / tot_concepts * 10 sys.stdout.write(str(percent_done) + "% ") j = j + 1 tc = tc + 1 # update concepts_contained fields for all subresource objects for para_num in range(tot_para): if len(para_conceptIDs_contained[para_num]) > 0: target_paragraphs[para_num].concepts_contained.add( *para_conceptIDs_contained[para_num]) #create target_A matrix target_A_sentence = coo_matrix( (array(data_sentence), (array(row_sentence), array(col_sentence))), shape=(tot_sent, tot_concepts), dtype=int16) #target_A_paragraph = 
coo_matrix((array(data_paragraph),(array(row_paragraph),array(col_paragraph))),shape=(tot_para,tot_concepts),dtype=int16) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # now convert target_A into a scipy csr_matrix (sparse matrix) target_A_sentence = target_A_sentence.tocsr() #target_A_paragraph = target_A_paragraph.tocsr() # calculate AtA for target_A AtA_sentence = target_A_sentence.T * target_A_sentence #AtA_paragraph = target_A_paragraph.T * target_A_paragraph # add AtA to Big_A if count == 0: bigA_AtA_sentence = AtA_sentence N_sentence = tot_sent #bigA_AtA_paragraph = AtA_paragraph #N_paragraph = tot_para else: bigA_AtA_sentence = bigA_AtA_sentence + AtA_sentence N_sentence = N_sentence + tot_sent #bigA_AtA_paragraph = bigA_AtA_paragraph + AtA_paragraph #N_paragraph = N_paragraph + tot_para sys.stdout.write(str(time() - start) + " seconds\n") sys.stdout.flush() except Exception, e: print "failed. exception: " + str(e) traceback.print_exc()
def handleEvent(self, event): eventName = event.eventType srcModuleName = event.module eventData = event.data if self.errorState: return None if self.opts['api_key'] == "": self.sf.error( "You enabled sfp_pastebin but did not set a Google API key!", False) self.errorState = True return None if eventData in self.results: return None else: self.results[eventData] = True for dom in self.domains.keys(): links = list() target = self.domains[dom] res = self.sf.googleIterate( searchString="+site:{target_site} \"{search_keyword}\"".format( target_site=target, search_keyword=eventData, ), opts={ "timeout": self.opts["_fetchtimeout"], "useragent": self.opts["_useragent"], "api_key": self.opts["api_key"], "cse_id": self.opts["cse_id"], }, ) if res is None: # Failed to talk to the Google API or no results returned return None urls = res["urls"] new_links = list(set(urls) - set(self.results.keys())) # Add new links to results for l in new_links: self.results[l] = True relevant_links = [ link for link in new_links if self.sf.urlBaseUrl(link).endswith(target) ] for link in relevant_links: self.sf.debug("Found a link: " + link) if self.checkForStop(): return None res = self.sf.fetchUrl(link, timeout=self.opts['_fetchtimeout'], useragent=self.opts['_useragent']) if res['content'] is None: self.sf.debug("Ignoring " + link + " as no data returned") continue # Sometimes pastes search results false positives if re.search( "[^a-zA-Z\-\_0-9]" + re.escape(eventData) + "[^a-zA-Z\-\_0-9]", res['content'], re.IGNORECASE) is None: continue try: startIndex = res['content'].index(eventData) except BaseException as e: self.sf.debug("String not found in pastes content.") continue evt1 = SpiderFootEvent("LEAKSITE_URL", link, self.__name__, event) self.notifyListeners(evt1) evt2 = SpiderFootEvent("LEAKSITE_CONTENT", res['content'], self.__name__, evt1) self.notifyListeners(evt2)
def handleEvent(self, event): eventName = event.eventType srcModuleName = event.module eventData = event.data if self.errorState: return None self.sf.debug("Received event, " + eventName + ", from " + srcModuleName) if self.opts['api_key'] == "": self.sf.error( "You enabled sfp_onioncity but did not set a Google API key!", False) self.errorState = True return None if eventData in self.results: self.sf.debug("Already did a search for " + eventData + ", skipping.") return None else: self.results[eventData] = True # Sites hosted on the domain res = self.sf.googleIterate( searchString="+site:onion.link " + eventData, opts={ "timeout": self.opts["_fetchtimeout"], "useragent": self.opts["_useragent"], "api_key": self.opts["api_key"], "cse_id": self.opts["cse_id"], }, ) if res is None: # Failed to talk to the bing API or no results returned return None urls = res["urls"] new_links = list(set(urls) - set(self.results.keys())) # Add new links to results for l in new_links: self.results[l] = True # Submit the Google results for analysis googlesearch_url = res["webSearchUrl"] response = self.sf.fetchUrl( googlesearch_url, timeout=self.opts["_fetchtimeout"], useragent=self.opts["_useragent"], ) if response['code'].startswith('2'): evt = SpiderFootEvent("RAW_RIR_DATA", response["content"], self.__name__, event) self.notifyListeners(evt) else: self.sf.error("Failed to fetch Google web search URL", exception=False) # Check if we've been asked to stop if self.checkForStop(): return None darknet_links = [ link for link in new_links if self.sf.urlFQDN(link).endswith(".onion.link") ] for link in darknet_links: self.sf.debug("Found a darknet mention: " + link) torlink = link.replace(".onion.link", ".onion") if self.opts['fetchlinks']: res = self.sf.fetchUrl(torlink, timeout=self.opts['_fetchtimeout'], useragent=self.opts['_useragent']) if res['content'] is None: self.sf.debug("Ignoring " + link + " as no data returned") continue # Sometimes onion city search results false positives if re.search( "[^a-zA-Z\-\_0-9]" + re.escape(eventData) + "[^a-zA-Z\-\_0-9]", res['content'], re.IGNORECASE) is None: self.sf.debug("Ignoring " + link + " as no mention of " + eventData) continue evt = SpiderFootEvent("DARKNET_MENTION_URL", torlink, self.__name__, event) self.notifyListeners(evt) try: startIndex = res['content'].index(eventData) - 120 endIndex = startIndex + len(eventData) + 240 except BaseException as e: self.sf.debug("String not found in content.") continue data = res['content'][startIndex:endIndex] evt = SpiderFootEvent("DARKNET_MENTION_CONTENT", "..." + data + "...", self.__name__, evt) self.notifyListeners(evt) else: evt = SpiderFootEvent("DARKNET_MENTION_URL", torlink, self.__name__, event) self.notifyListeners(evt)
def do_command(self, e, cmd, nick, target, reply, dm): c = self.connection emoticontable = { ':)': '☺', # Some lines commented out due to lack of widespread font support # ':D': '😃', # '^^': '😄', # '^_^':'😄', # ':|': '😑', ':(': '☹', # ':/': '😕', # ':\\':'😕', # '-.-':'😒', # ':P' :'😛', # ';P' :'😜', # 'xP' :'😝', # ';)' :'😉', # ':?' :'😖', # '>:(':'😠', # 'D:' :'😦', # ':o' :'😯', # ':O' :'😮', # 'B)' :'😎' } for emoticon, uchar in emoticontable.items(): if re.findall('(^|\W)'+re.escape(emoticon)+'(\W|$)', cmd) and random() < 0.333: reply('Did you mean {} (U+{:x}) with “{}”?'.format(uchar, ord(uchar), emoticon)) break def replyopen(): if self.lastopen: reply('Space was last marked {} by {} on {}.'.format(*self.lastopen)) else: reply("I don't know when was the last time the space was open.") if cmd.startswith('open'): if '?' in cmd or '‽' in cmd: if cmd.count('?') >= 5: self.sendchan('afrabot: open?') return replyopen() else: if cmd.count('!') > 5: reply('u mad bro?') return self.set_open(True, nick) return if cmd.startswith('closed'): if '?' in cmd or '‽' in cmd: replyopen() else: if cmd.count('!') > 5: reply('u mad bro?') return dm('Please remember to follow the shutdown protocol.') self.set_open(False, nick) return if re.match('^ *genug +pleniert[.!]{,5}$', cmd) or re.match('^plenum[?!‽.]{,5}$', cmd): cs = self.chaossternchen if 'genug' in cmd: self.chaossternchen = [] reply('Plenum beendet.') else: reply('Aye! So far, there are {} Chaos-☆'.format(len(cs)) + ('.' if len(cs) == 0 else ':')) for entry in enumerate(cs): reply('Chaos-☆ {}: {}'.format(*entry)) return csmatch = re.match('^ *(delete|remove) +chaos-?([☆★☼☀*]|sternchen) *([0-9]+)[.!]{,5}$', cmd) if csmatch: try: num = int(csmatch.group(3)) del self.chaossternchen[num] reply('Chaos-☆ {} deleted.'.format(num)) except: reply('wut?') return if re.match('^help[?!‽.]*$', cmd): helptext = """open|closed? - query whether space is open open|closed - set space open/closed chaos*: [foobar] - add plenum topic delete chaos* [num] - delete plenum topic number [n] shutdown - list things to do when closing the space plenum - list plenum topics ... and many more, doc urgently needed. Please submit PRs on github: https://github.com/afra/afrab0t """ for line in helptext.splitlines(): reply(line) return if re.match('^shutdown[?‽]*$', cmd): helptext = """* Fenster schließen (Beim rechten Fenster muss ein Hebel unten am Fenster betätigt werden. Bitte stellt sicher, dass beide Fenster dicht geschlossen sind.) * Tische aufräumen und bei Bedarf kurz abwischen * Geschirr spülen * Kühlschrank auffüllen * Heizung auf eine angemessene Stufe stellen (Winter: 2-3) * Lampen, Computer, Boxen, Beamer, Kochplatte, Ofen, *Wasserkocher*, Laser abschalten * Gucken, ob ralisi noch Geschirr abwäscht * Müll mit runter nehmen * Raum-, Aufgangs- und Haustür verschließen """ for line in helptext.splitlines(): reply(line) return if cmd == 'ponies?': reply('yes please!') return if re.match('^ *tell +afrab[o0]t +', cmd): reply('what is your problem?') return if cmd.rstrip('?') in ('where', 'location', 'wo'): reply('AfRA e.V. is located at Herzbergstr. 55, 10365 Berlin, 2.HH/Aufgang B, 3. floor on the' 'left (Rm 3.08). Public transport: Tram M8, 21, 37 & Bus 256, N56, N50 → Herzbergstr./Siegfriedstr.' 'Door closed? Try +49-176-29769254 !') return if cmd.rstrip('?') in ('tel', 'telefon', 'telephone', 'phone', 'handy', 'fon'): reply("Locked out? Wanna know what's up at AfRA? 
Try +49-176-29769254 !") return if cmd.rstrip('?!.') in ('cats', 'katzen', 'kittens', 'kätzchen'): try: submissions = self.reddit.get_subreddit('cats').get_hot(limit=50) index, item = next((i,s) for i,s in enumerate(submissions) if s.url not in self.catpiccache and not s.stickied and not s.is_self) self.catpiccache.append(item.url) if index != 5: reply('Got some cats for you: '+item.url) else: reply("Gee, you really like those cat things, don't you? You know, I could use some love, too: https://github.com/afra/afrab0t") except StopIteration: reply('The intertubes are empty.') return if cmd.rstrip('?!.') == 'catspam': def catspam(): try: submissions = self.reddit.get_subreddit('cats').get_hot(limit=32) for s in submissions: if s.url not in self.nickcatpiccache[nick] and s.url not in self.catpiccache and not s.stickied and not s.is_self: self.nickcatpiccache[nick].append(s.url) dm(s.url) time.sleep(3) except Exception as e: log('Catspam problem:', e) reply('The intertubes are empty.') thr = Thread(target=catspam) thr.start() return if cmd.rstrip('?!.') in ('answer', 'antworte', 'antwort'): reply('42') return # ETA handling if cmd.rstrip('?') in ('etas', 'who', 'da'): with self.db as db: db.execute("DELETE FROM etas WHERE timestamp < DATETIME('now', '-1 day')") etas = ', '.join(nick+': '+eta for nick,eta in db.execute("SELECT nick, eta FROM etas").fetchall()) if etas: reply('Current ETAs: '+etas) else: reply('No ETAs have been announced yet.') return # key handling keycmd = re.match('key ([\w]+) to ([\w]+)( *: *.*)?', cmd) if keycmd: with self.db as db: keystate, = db.execute("SELECT keystate FROM keylog ORDER BY timestamp DESC LIMIT 1").fetchone() keystatelist = keystate.split(', ') fromnick, tonick, comment = keycmd.groups() if not fromnick in keystatelist: reply('According to my information, as of now {} does not have a key. Current key' 'holders are {}.'.format(fromnick, keystate)) return keystatelist[keystatelist.index(fromnick)] = tonick keystate = ', '.join(keystatelist) db.execute("INSERT INTO keylog VALUES (DATETIME('now'),?,?,?,?)", (fromnick, tonick, keystate, comment)) self.sendchan('Key transfer: {}→{}. Current key holders: {}'.format(fromnick, tonick, keystate)) return if cmd.rstrip('?') == 'progress': t = datetime.datetime.now().time() p = 0 if t.hour > 6 and t.hour < 18: p = ((t.hour-6)*3600+t.minute*60+t.second)/(3600*11) foo = round(67*p) bar = '='*foo space = ' '*(67-foo) reply('['+bar+'>'+space+'] ({:.2f}%)'.format(p*100)) return if cmd.startswith('keystate '): keystate = re.split('[,;/: ]*', cmd)[1:] self.db.execute("INSERT INTO keylog VALUES (DATETIME('now'),'','',?,'')", (', '.join(keystate),)) self.sendchan('Key status set. Current key holders: {}'.format(', '.join(keystate))) return keylog = re.match('keylog *([0-9]*)', cmd) if keylog: num = max(50, int(keylog.group(1) or 8)) dm('The latest {} key log entries:'.format(num)) loglines = self.db.execute("SELECT * FROM keylog ORDER BY timestamp DESC LIMIT ?", (num,)) for timestamp, fromnick, tonick, keystate, comment in reversed(loglines): dm('{}: {}→{}; Key holders {}; Comment: "{}"'.format( timestamp, fromnick, tonick, keystate, comment)) dm('EOL') return if cmd.startswith("f**k you"): reply('F*****g is entirely unnecessary: I can reproduce via copy-and-paste!') return if cmd.startswith("geh kacken"): reply('Command "kacken" not implemented. You are welcome to submit a pull request on github at https://github.com/afra/afrab0t') return # fall-through c.notice(nick, 'I don\'t know what you mean with "{}"'.format(cmd))
def test_bug_612074(self): pat = u"[" + re.escape(u"\u2039") + u"]" self.assertEqual(re.compile(pat) and 1, 1)
def run(): #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ '''start = time() # Browser br = mechanize.Browser() # Cookie Jar cj = cookielib.LWPCookieJar() br.set_cookiejar(cj) # Browser options br.set_handle_equiv(True) #br.set_handle_gzip(True) br.set_handle_redirect(True) br.set_handle_referer(True) br.set_handle_robots(False) # Follows refresh 0 but not hangs on refresh > 0 br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) # Want debugging messages? #br.set_debug_http(True) #br.set_debug_redirects(True) #br.set_debug_responses(True) # User-Agent (this is cheating, ok?) br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')] # Open some site, let's pick a random one, the first that pops in mind: br.add_password('https://www.nejm.org/sign-in', '*****', '*****') r = br.open('http://www.nejm.org/sign-in') br.select_form(nr=0) br.form['login']='******' br.form['password']='******' br.submit() print "initiated browser: " + str(time()-start) + " seconds" start = time() entry_urls = [] for i in range(1, 38): html_url = 'http://www.nejm.org/medical-articles/review?page=' + str(i) r = br.open(html_url) html_source = r.read() soup = BeautifulSoup(html_source, 'html5lib') articleEntries = soup.find_all(class_='articleEntry') for entry in articleEntries: entry_urls.append('http://www.nejm.org' + entry.a['href']) print "obtained urls: " + str(time()-start) + " seconds" #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ for count, entry_url in enumerate(entry_urls): try: sys.stdout.write("article # " + str(count) + " adding to database...") start = time() r = br.open(entry_url) entry_html_source = r.read() # format of returned list from get_metadata function: # 0 identifier # 1 type # 2 language # 3 title # 4 date # 5 publisher # 6 author # 7 journal # 8 volume # 9 issue # 10 firstpage # 11 lastpage # 12 url res_metadata = parser.get_metadata(entry_url, entry_html_source) res_metadata[1] = 'NEJM review articles' res_identifier = res_metadata[0] # creates new Resource object and containing Subresource objects # creates Resource based on returned parser metadata res = Resource(identifier = res_metadata[0], type = res_metadata[1], language = res_metadata[2], title = res_metadata[3], date = res_metadata[4], publisher = res_metadata[5], author = res_metadata[6], journal = res_metadata[7], volume = res_metadata[8], issue = res_metadata[9], firstpage = res_metadata[10], lastpage = res_metadata[11], url = entry_url, html_source = entry_html_source) res.save() res.user.add(9) # corresponds to [email protected] #res.user.add(2) # corresponds to [email protected] res.domain.add(1) # corresponds to Biomedical subres = [] # creates Subresource objects of type 'figure' figures = parser.get_figures(entry_url, entry_html_source) for i, figure in enumerate(figures): subres.append(Subresource(containing_resource = res, name = figure[0], type = 'figure', content = figure[1], url = figure[4])) # creates Subresource objects of type 'paragraph' paragraphs = parser.get_paragraphs(entry_url, entry_html_source) for i, paragraph in enumerate(paragraphs): subres.append(Subresource(containing_resource = res, name = 'paragraph ' + str(i), type = 'paragraph', content = paragraph)) subres_temp = Subresource.objects.bulk_create(subres) del subres_temp del subres sys.stdout.write(str(time()-start) + " seconds\n") 
sys.stdout.flush() except Exception, e: print "failed. exception: "+str(e) traceback.print_exc()''' #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sys.stdout.write("~~~~loading concepts and term lists... ") start = time() file = open('scripts\MESH_concept_and_terms_tuple.pkl', 'rb') (tot_concepts, concept_IDs, term_lists) = pickle_zloads(file.read()) file.close() sys.stdout.write("%.2f" % (time() - start) + "seconds\n") sys.stdout.flush() res_ids = list( Resource.objects.filter(type="NEJM review articles").values_list( 'id', flat=True)) print "total # of resources: " + str(len(res_ids)) for count, res_id in enumerate(res_ids): try: sys.stdout.write("article # " + str(count) + " processing...") start = time() target_paragraphs = Subresource.objects.filter( containing_resource_id=res_id) #create sentences from target_paragraphs sentences = [] sentences_indexofparagraph = [] tot_para = 0 tot_sent = 0 for para_num, target_paragraph in enumerate(target_paragraphs): #find all sentence in this paragraph tokenized_sentences = sent_tokenize( target_paragraph.content.rstrip()) sentences.extend(tokenized_sentences) sentences_indexofparagraph.extend([para_num] * len(tokenized_sentences)) tot_sent = tot_sent + len(tokenized_sentences) tot_para = tot_para + 1 tot_para = len(target_paragraphs) #second go through each concept/term, find them in subresources, and process into matrix tc = 0 j = 0 row_sentence = [] row_paragraph = [] col_sentence = [] col_paragraph = [] data_sentence = [] data_paragraph = [] # initialize list of empty lists for storing concepts contained in each paragraph para_conceptIDs_contained = [[] for i in range(tot_para)] for i, con_ID in enumerate(concept_IDs): term_list = term_lists[i] wordcount_in_paragraphs = [0] * tot_para terms_regex = [ r"\b" + re2.escape(term.lower()) + r"\b" for term in term_list ] search_pattern = re2.compile("|".join(terms_regex)) for sent_num, sentence in enumerate(sentences): wordcount = len(search_pattern.findall(sentence.lower())) if wordcount > 0: #only go ahead if search_pattern is in the sentence row_sentence.append(sent_num) col_sentence.append(tc) data_sentence.append(1) wordcount_in_paragraphs[ sentences_indexofparagraph[sent_num]] += wordcount for para_num in range(tot_para): wordcount_in_p = wordcount_in_paragraphs[para_num] if wordcount_in_p > 0: row_paragraph.append(para_num) col_paragraph.append(tc) data_paragraph.append(1) para_conceptIDs_contained[para_num].append(con_ID) if tc * 10 / tot_concepts > j: percent_done = tc * 10 / tot_concepts * 10 sys.stdout.write(str(percent_done) + "% ") j = j + 1 tc = tc + 1 # update concepts_contained fields for all subresource objects for para_num in range(tot_para): if len(para_conceptIDs_contained[para_num]) > 0: target_paragraphs[para_num].concepts_contained.add( *para_conceptIDs_contained[para_num]) #create target_A matrix target_A_sentence = coo_matrix( (array(data_sentence), (array(row_sentence), array(col_sentence))), shape=(tot_sent, tot_concepts), dtype=int16) #target_A_paragraph = coo_matrix((array(data_paragraph),(array(row_paragraph),array(col_paragraph))),shape=(tot_para,tot_concepts),dtype=int16) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # now convert target_A into a scipy csr_matrix (sparse matrix) target_A_sentence = target_A_sentence.tocsr() #target_A_paragraph = target_A_paragraph.tocsr() # calculate AtA for target_A AtA_sentence = target_A_sentence.T * 
target_A_sentence #AtA_paragraph = target_A_paragraph.T * target_A_paragraph # add AtA to Big_A if count == 0: bigA_AtA_sentence = AtA_sentence N_sentence = tot_sent #bigA_AtA_paragraph = AtA_paragraph #N_paragraph = tot_para else: bigA_AtA_sentence = bigA_AtA_sentence + AtA_sentence N_sentence = N_sentence + tot_sent #bigA_AtA_paragraph = bigA_AtA_paragraph + AtA_paragraph #N_paragraph = N_paragraph + tot_para sys.stdout.write(str(time() - start) + " seconds\n") sys.stdout.flush() except Exception, e: print "failed. exception: " + str(e) traceback.print_exc()
def on_trigger(self, message): """ @type message: hubbot.message.IRCMessage """ pointing_pattern = "^points at {}.+(kitty|kitteh)".format(re.escape(self.bot.nickname)) if message.reply_to in self.bot.channels.keys(): if "RoBoBo" not in self.bot.channels[message.reply_to].users.keys(): if message.message_string.lower().startswith("meow"): roll = hash((message.user.name, int(time.time()) / 3600, "meow")) % 20 + 1 if message.user.name == "BillTheCat": return IRCResponse(ResponseType.SAY, "Uhm... Hi?", message.reply_to) if message.user.name.startswith( "Caitiri") or message.user.name == "Caity" or message.user.name.startswith("Heuf"): if roll == 20: return IRCResponse(ResponseType.DO, 'points at {}, "CRITICAL KITTEH!"'.format(message.user.name), message.reply_to) else: return IRCResponse(ResponseType.DO, 'points at {}, "KITTEH!"'.format(message.user.name), message.reply_to) elif roll == 1: reroll = hash((message.user.name, int(time.time()) / 3600, "meow", 42)) % 20 + 1 if reroll == 20: return [IRCResponse(ResponseType.DO, 'points at {}, "CRITICAL PUPPEH!"'.format(message.user.name), message.reply_to), IRCResponse(ResponseType.SAY, "Wait, what?", message.reply_to)] else: return IRCResponse(ResponseType.DO, 'points at {}, "NOT KITTEH."'.format(message.user.name), message.reply_to) elif (roll > 1) and (roll < 8): return IRCResponse(ResponseType.DO, 'points at {}, "NOT KITTEH."'.format(message.user.name), message.reply_to) elif (roll > 7) and (roll < 14): return IRCResponse(ResponseType.DO, 'points at {}, "MEHBEH KITTEH?"'.format(message.user.name), message.reply_to) elif (roll > 13) and (roll < 20): return IRCResponse(ResponseType.DO, 'points at {}, "KITTEH!"'.format(message.user.name), message.reply_to) else: return IRCResponse(ResponseType.DO, 'points at {}, "CRITICAL KITTEH!"'.format(message.user.name), message.reply_to) elif message.message_string.lower().startswith("rawr"): roll = hash((message.user.name, int(time.time()) / 3600, "rawr")) % 20 + 1 dragons = ["Itazu", "Trahsi", "reptile"] if message.user.name in dragons: return IRCResponse(ResponseType.SAY, "{} is a DRAGON!".format(message.user.name), message.reply_to) elif roll == 1: reroll = hash((message.user.name, int(time.time()) / 3600, "rawr", 42)) % 20 + 1 if reroll == 20: return IRCResponse(ResponseType.SAY, "{} is SECRETLY A DRAGON!".format(message.user.name), message.reply_to) else: return IRCResponse(ResponseType.SAY, "{} is NOT a DINOSAUR.".format(message.user.name), message.reply_to) elif (roll > 1) and (roll < 8): return IRCResponse(ResponseType.SAY, "{} is NOT a DINOSAUR.".format(message.user.name), message.reply_to) elif (roll > 7) and (roll < 14): return IRCResponse(ResponseType.SAY, "{} MIGHT be a DINOSAUR.".format(message.user.name), message.reply_to) elif (roll > 13) and (roll < 20): return IRCResponse(ResponseType.SAY, "{} is a DINOSAUR.".format(message.user.name), message.reply_to) else: return IRCResponse(ResponseType.SAY, "{} is a CRITICAL DINOSAUR!".format(message.user.name), message.reply_to) elif message.type == "ACTION" and re.match(pointing_pattern, message.message_string, re.IGNORECASE): return IRCResponse(ResponseType.SAY, "Curses, you've tumbled my nefarious plan!", message.reply_to)
import re2
import re
import timeit


def bump_num(matchobj):
    int_value = int(matchobj.group(0))
    return str(int_value + 1).encode('utf-8')


print(re2.sub(b'\\d+', bump_num, b'08.2 -2 23x99y'))
print(b'9.3 -3 24x100y')

s = b'\\1\\1'
print(re2.escape(s) == s)
print(re2.sub(b'(.)', re2.escape(s), b'x'))
print(re2.sub(b'(.)', re2.escape(s), b'x') == s)

import os.path as opath

path = opath.dirname(opath.abspath(__file__))
fn = opath.join(path, "tests", "genome.dat")
with open(fn, 'rb') as fd:
    genome = fd.read()

search = b"c[cg]cg[ag]g"
# search = b"cattctg"

re2_regex = re2.compile(search)
re_regex = re.compile(search)


def testre2():
    return re2_regex.findall(genome)


def testre():
    return re_regex.findall(genome)
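A minimal timing sketch, not part of the original file: it assumes the testre2/testre helpers and the timeit import above are in scope; the repetition count is arbitrary.

if __name__ == "__main__":
    # Assumed harness: time both findall wrappers defined above.
    n_runs = 5  # arbitrary choice
    t_re2 = timeit.timeit(testre2, number=n_runs)
    t_re = timeit.timeit(testre, number=n_runs)
    print("re2 findall: %.3f s/run" % (t_re2 / n_runs))
    print("re  findall: %.3f s/run" % (t_re / n_runs))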
def run(): sys.stdout.write("~~~~loading concepts and term lists... ") start = time() file = open('scripts\MESH_concept_and_terms_tuple.pkl','rb') (tot_concepts, concept_IDs, term_lists) = pickle.loads(file.read()) file.close() print "%.2f" % (time()-start), "seconds" for filenumber in [str(766-x) for x in range(20)]: print "FILENUM: " + filenumber row = [] col = [] data = [] sys.stdout.write("~~~~parsing XML file... ") start = time() tree = ET.parse("..\..\PubMed\zip\medline13n0%s.xml" % filenumber) root = tree.getroot() citations = root.findall("MedlineCitation") sys.stdout.write("# citations: %d... " % len(citations)) abstracts = [] res_list = [] for citation in citations: abstract_ET = citation.find("Article/Abstract") if abstract_ET is not None: abstract_textlist = [] for t in abstract_ET.findall("AbstractText"): if t is not None: if t.text is not None: abstract_textlist.append(t.text) abstract = ' '.join(abstract_textlist) abstracts.append(abstract) res_tag = citation.find("PMID") if res_tag is None: url = '' identifier = '' else: identifier = res_tag.text url = "http://www.ncbi.nlm.nih.gov/pubmed/" + identifier res_tag = citation.find("Article/Language") if res_tag is None: language = '' else: language = res_tag.text[:2] res_tag = citation.find("Article/ArticleTitle") if res_tag is None: title = '' else: title = res_tag.text[:300] res_tag = citation.find("Article/Journal/JournalIssue/PubDate/Year") if res_tag is None: date = '' else: date = res_tag.text author_ET = citation.find("Article/AuthorList") if author_ET is not None: author_list = [] for t in author_ET.getchildren(): tt = t.find("LastName") if tt is not None: ttt = tt.find("Initials") if ttt is not None: author_list.append(tt.text+" "+ttt.text) else: author_list.append(tt.text) author = ', '.join(author_list) author = author[:767] res_tag = citation.find("Article/Journal/ISOAbbreviation") if res_tag is None: journal = '' else: journal = res_tag.text[:50] res_tag = citation.find("Article/Journal/JournalIssue/Volume") if res_tag is None: volume = '' else: volume = res_tag.text res_tag = citation.find("Article/Journal/JournalIssue/Issue") if res_tag is None: issue = '' else: issue = res_tag.text res_tag = citation.find("Article/Pagination/MedlinePgn") if res_tag is None: firstpage = '' else: firstpage = res_tag.text.split('-')[0] res = Resource(identifier = identifier, type = "pubmed_abstract", language = language, title = title, date = date, publisher = '', author = author, journal = journal, volume = volume, issue = issue, firstpage = firstpage, lastpage = '', url = url, html_source = '') res_list.append(res) sys.stdout.write("# abstracts: %d... " % len(abstracts)) print "%.2f" % (time()-start), "seconds" sys.stdout.write("~~~~crunching abstracts... ") start = time() abstract_conceptIDs_contained = [[] for i in range(len(abstracts))] for i, con_ID in enumerate(concept_IDs): if i % 1000 == 0: sys.stdout.write(str(int(i*100/tot_concepts))) sys.stdout.write("% ") term_list = term_lists[i] terms_regex = [r"\b"+re.escape(term.lower())+r"\b" for term in term_list] search_pattern = re.compile("|".join(terms_regex)) for abstract_num, abstract in enumerate(abstracts): wordcount = len(search_pattern.findall(abstract.lower())) if wordcount > 0: row.append(abstract_num) col.append(i) data.append(wordcount) abstract_conceptIDs_contained[abstract_num].append(con_ID) sys.stdout.write("... ") print "%.2f" % (time()-start), "seconds" sys.stdout.write("~~~~saving file containing tuple of database object models... 
") start = time() res_abstract_containedcon_tuplelist = [] for abstract_num in range(len(abstracts)): res_abstract_containedcon_tuplelist.append((res_list[abstract_num], abstracts[abstract_num], abstract_conceptIDs_contained[abstract_num])) path = "scripts\\files_for_ec2\\res_abstract_containedcon_tuplelist_medline13n0%s.pkl" % filenumber file = open(path,'wb') file.write(pickle_zdumps(res_abstract_containedcon_tuplelist)) file.close() print "%.2f" % (time()-start), "seconds" sys.stdout.write("~~~~creating target_A matrix... ") start = time() target_A = coo_matrix((array(data),(array(row),array(col))),shape=(len(abstracts),tot_concepts),dtype=int16) #now convert target_A into a scipy csr_matrix (sparse matrix) target_A = target_A.tocsr() path = "scripts\\pubmed_matrices\\rawA_medline13n0%s.pkl" % filenumber file = open(path,'wb') file.write(pickle_zdumps(target_A)) file.close() print "%.2f" % (time()-start), "seconds" # Following is to be run on EC2 to reduce network latency # '''
def convert_to_regex(pattern):
    ret = "^{}$".format(re.escape(pattern))  # fnmatch.translate(pattern)
    return ret.replace("\\?", ".").replace("\\*", ".*")
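A brief usage sketch (not from the original source), assuming the convert_to_regex helper above and the re module are in scope; the file names are made up.

import re

# Glob-style wildcards become an anchored regex: everything is escaped, then
# the escaped '?' and '*' are mapped back to '.' and '.*'.
print(convert_to_regex("data_*.csv"))   # e.g. ^data_.*\.csv$ on Python 3.7+
print(bool(re.match(convert_to_regex("data_*.csv"), "data_2020.csv")))   # True
print(bool(re.match(convert_to_regex("file?.txt"), "file1.txt")))        # True
print(bool(re.match(convert_to_regex("file?.txt"), "file10.txt")))       # False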
            if (t := self._match(r'[ \t]*')) and self._check(r'[^"\n]+'):
                level = len(t.value)
                if level > self.indent[-1]:
                    self.indent.append(level)
                    yield 'INDENT', None, m.pos
                elif level not in self.indent:
                    self._lexical_error('inconsistent indentation')
                elif level < self.indent[-1]:
                    while self.indent[-1] > level:
                        self.indent.pop()
                        yield 'DEDENT', None, m.pos

            return '__MANY', ('NEWLINE', None, m.pos), *(compute_indent() or ())
        elif m := self._match('|'.join(
                re.escape(special) for special in (self.SPECIALS))):
            return m.value, m.value, m.pos
        elif m := self._match('\x00'):
            return 'EOF', None, m.pos
        elif self._match('[ \t\r]+'):
            return self._next()
        else:
            self._lexical_error('bad lexeme')

    def lex(self):
        while token := self._next():
            if token[0] == '__MANY':
                for entry in token[1:]:
                    yield Token(*entry)
            else:
                yield Token(*token)
def test_bug_612074(self):
    pat = u"[" + re.escape(u"\u2039") + u"]"
    self.assertEqual(re.compile(pat) and 1, 1)
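The same escaping trick the regression test relies on can be used when building a character class from arbitrary characters; a small sketch (my own example, not part of the test suite):

import re

# Escape each character so metacharacters like ']' or '^' cannot break out of the class.
chars = u"]^-\u2039"
char_class = u"[" + u"".join(re.escape(c) for c in chars) + u"]"

assert re.match(char_class, u"\u2039")
assert re.match(char_class, u"-")
assert not re.match(char_class, u"a")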
def run(): '''#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ start = time() # Browser br = mechanize.Browser() # Cookie Jar cj = cookielib.LWPCookieJar() br.set_cookiejar(cj) # Browser options br.set_handle_equiv(True) #br.set_handle_gzip(True) br.set_handle_redirect(True) br.set_handle_referer(True) br.set_handle_robots(False) # Follows refresh 0 but not hangs on refresh > 0 br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) # Want debugging messages? #br.set_debug_http(True) #br.set_debug_redirects(True) #br.set_debug_responses(True) # User-Agent br.addheaders = [('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')] print "initiated browser: " + str(time()-start) + " seconds" #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # PMIDs and DOIs of Radiology review articles from 2003/01 to 2013/01 identifiers = [{"pmid":12601205,"doi":"10.1148/radiol.2263011540"},{"pmid":12616012,"doi":"10.1148/radiol.2271001744"},{"pmid":12616015,"doi":"10.1148/radiol.2263020109"},{"pmid":12637675,"doi":"10.1148/radiol.2272011329"},{"pmid":12637677,"doi":"10.1148/radiol.2272012071"},{"pmid":12668742,"doi":"10.1148/radiol.2271010938"},{"pmid":12738874,"doi":"10.1148/radiol.2281020307"},{"pmid":12773666,"doi":"10.1148/radiol.2273011499"},{"pmid":12819343,"doi":"10.1148/radiol.2282011726"},{"pmid":12832569,"doi":"10.1148/radiol.2281020874"},{"pmid":12832573,"doi":"10.1148/radiol.2281021567"},{"pmid":12954885,"doi":"10.1148/radiol.2283030674"},{"pmid":12954888,"doi":"10.1148/radiol.2283021557"},{"pmid":14500855,"doi":"10.1148/radiol.2292030516"},{"pmid":14519867,"doi":"10.1148/radiol.2291020222"},{"pmid":14593188,"doi":"10.1148/radiol.2293010899"},{"pmid":14595138,"doi":"10.1148/radiol.2292020402"},{"pmid":14657300,"doi":"10.1148/radiol.2293031280"},{"pmid":14695382,"doi":"10.1148/radiol.2301031028"},{"pmid":14695386,"doi":"10.1148/radiol.2301021122"},{"pmid":14695395,"doi":"10.1148/radiol.2301021482"},{"pmid":14739312,"doi":"10.1148/radiol.2303021726"},{"pmid":14752175,"doi":"10.1148/radiol.2302031698"},{"pmid":14752178,"doi":"10.1148/radiol.2302021489"},{"pmid":14990813,"doi":"10.1148/radiol.2311020452"},{"pmid":15044750,"doi":"10.1148/radiol.2312021185"},{"pmid":15068942,"doi":"10.1148/radiol.2311021620"},{"pmid":15118110,"doi":"10.1148/radiol.2313021488"},{"pmid":15128979,"doi":"10.1148/radiol.2312032097"},{"pmid":15163803,"doi":"10.1148/radiol.2313040154"},{"pmid":15163813,"doi":"10.1148/radiol.2313030173"},{"pmid":15220490,"doi":"10.1148/radiol.2321021803"},{"pmid":15220491,"doi":"10.1148/radiol.2321030636"},{"pmid":15284429,"doi":"10.1148/radiol.2323031558"},{"pmid":15284433,"doi":"10.1148/radiol.2323030830"},{"pmid":15286305,"doi":"10.1148/radiol.2322021326"},{"pmid":15286311,"doi":"10.1148/radiol.2322040305"},{"pmid":15317956,"doi":"10.1148/radiol.2331020777"},{"pmid":15375227,"doi":"10.1148/radiol.2332031119"},{"pmid":15454614,"doi":"10.1148/radiol.2331041059"},{"pmid":15498896,"doi":"10.1148/radiol.2333031150"},{"pmid":15564389,"doi":"10.1148/radiol.2341031302"},{"pmid":15601895,"doi":"10.1148/radiol.2342031990"},{"pmid":15650038,"doi":"10.1148/radiol.2343030333"},{"pmid":15670993,"doi":"10.1148/radiol.2342030897"},{"pmid":15716389,"doi":"10.1148/radiol.2351031455"},{"pmid":15734922,"doi":"10.1148/radiol.2343041670"},{"pmid":15734925,"doi":"10.1148/radiol.2343031362"},{"pmid":15734929,"doi":"10.1148
/radiol.2343031768"},{"pmid":15734940,"doi":"10.1148/radiol.2343030946"},{"pmid":15833981,"doi":"10.1148/radiol.2353040037"},{"pmid":15845798,"doi":"10.1148/radiol.2353042205"},{"pmid":15858079,"doi":"10.1148/radiol.2352040330"},{"pmid":15858080,"doi":"10.1148/radiol.2352040307"},{"pmid":15858081,"doi":"10.1148/radiol.2352040727"},{"pmid":15858087,"doi":"10.1148/radiol.2352040262"},{"pmid":15858096,"doi":"10.1148/radiol.2352032121"},{"pmid":15860674,"doi":"10.1148/radiol.2353040457"},{"pmid":15914473,"doi":"10.1148/radiol.2353041760"},{"pmid":15914474,"doi":"10.1148/radiol.2353041865"},{"pmid":15972340,"doi":"10.1148/radiol.2362040513"},{"pmid":15983074,"doi":"10.1148/radiol.2361041278"},{"pmid":15987959,"doi":"10.1148/radiol.2361041926"},{"pmid":15987960,"doi":"10.1148/radiol.2361031674"},{"pmid":16100082,"doi":"10.1148/radiol.2371040585"},{"pmid":16118165,"doi":"10.1148/radiol.2363041042"},{"pmid":16170017,"doi":"10.1148/radiol.2372050199"},{"pmid":16237143,"doi":"10.1148/radiol.2373041717"},{"pmid":16251391,"doi":"10.1148/radiol.2373040966"},{"pmid":16304103,"doi":"10.1148/radiol.2373050220"},{"pmid":16304111,"doi":"10.1148/radiol.2373050176"},{"pmid":16373757,"doi":"10.1148/radiol.2381041602"},{"pmid":16436808,"doi":"10.1148/radiol.2382051462"},{"pmid":16436809,"doi":"10.1148/radiol.2382041977"},{"pmid":16452394,"doi":"10.1148/radiol.2382050062"},{"pmid":16452395,"doi":"10.1148/radiol.2382050063"},{"pmid":16505391,"doi":"10.1148/radiol.2383041109"},{"pmid":16543592,"doi":"10.1148/radiol.2392050413"},{"pmid":16567481,"doi":"10.1148/radiol.2391041043"},{"pmid":16567482,"doi":"10.1148/radiol.2391050343"},{"pmid":16641348,"doi":"10.1148/radiol.2392052002"},{"pmid":16709793,"doi":"10.1148/radiol.2401050061"},{"pmid":16714455,"doi":"10.1148/radiol.2393042031"},{"pmid":16714456,"doi":"10.1148/radiol.2393050823"},{"pmid":16720868,"doi":"10.1148/radiol.2401050134"},{"pmid":16864664,"doi":"10.1148/radiol.2402050314"},{"pmid":16926320,"doi":"10.1148/radiol.2403050818"},{"pmid":16926321,"doi":"10.1148/radiol.2403050542"},{"pmid":16990669,"doi":"10.1148/radiol.2411050628"},{"pmid":17053199,"doi":"10.1148/radiol.2413051358"},{"pmid":17057062,"doi":"10.1148/radiol.2412060169"},{"pmid":17057063,"doi":"10.1148/radiol.2412041866"},{"pmid":17090716,"doi":"10.1148/radiol.2421052011"},{"pmid":17114619,"doi":"10.1148/radiol.2413051535"},{"pmid":17185659,"doi":"10.1148/radiol.2421052135"},{"pmid":17185660,"doi":"10.1148/radiol.2421051180"},{"pmid":17185662,"doi":"10.1148/radiol.2421050677"},{"pmid":17229874,"doi":"10.1148/radiol.2423051403"},{"pmid":17255408,"doi":"10.1148/radiol.2422051113"},{"pmid":17325062,"doi":"10.1148/radiol.2423051631"},{"pmid":17325078,"doi":"10.1148/radiol.2423041600"},{"pmid":17384237,"doi":"10.1148/radiol.2432050057"},{"pmid":17392247,"doi":"10.1148/radiol.2431030580"},{"pmid":17431128,"doi":"10.1148/radiol.2433060243"},{"pmid":17446526,"doi":"10.1148/radiol.2433061411"},{"pmid":17456864,"doi":"10.1148/radiol.2432060009"},{"pmid":17456865,"doi":"10.1148/radiol.2432060307"},{"pmid":17456883,"doi":"10.1148/radiol.2432030499"},{"pmid":17495176,"doi":"10.1148/radiol.2441052145"},{"pmid":17507723,"doi":"10.1148/radiol.2441060773"},{"pmid":17517922,"doi":"10.1148/radiol.2433070350"},{"pmid":17517924,"doi":"10.1148/radiol.2433060850"},{"pmid":17517925,"doi":"10.1148/radiol.2433051098"},{"pmid":17517926,"doi":"10.1148/radiol.2433051649"},{"pmid":17522346,"doi":"10.1148/radiol.2441051790"},{"pmid":17581895,"doi":"10.1148/radiol.2441051769"},{"pmid":17581896,"doi":"10.1148/radiol.2441060995"
},{"pmid":17592037,"doi":"10.1148/radiol.2442060136"},{"pmid":17641360,"doi":"10.1148/radiol.2442051766"},{"pmid":17641361,"doi":"10.1148/radiol.2442051620"},{"pmid":17709823,"doi":"10.1148/radiol.2443060295"},{"pmid":17709824,"doi":"10.1148/radiol.2443051661"},{"pmid":17709825,"doi":"10.1148/radiol.2443060582"},{"pmid":17848679,"doi":"10.1148/radiol.2451061280"},{"pmid":17848685,"doi":"10.1148/radiol.2452070397"},{"pmid":17885179,"doi":"10.1148/radiol.2451060731"},{"pmid":17885180,"doi":"10.1148/radiol.2451051706"},{"pmid":17885181,"doi":"10.1148/radiol.2451051359"},{"pmid":17885185,"doi":"10.1148/radiol.2451061204"},{"pmid":17940297,"doi":"10.1148/radiol.2452061117"},{"pmid":17940298,"doi":"10.1148/radiol.2452061706"},{"pmid":17940300,"doi":"10.1148/radiol.2452060445"},{"pmid":17940301,"doi":"10.1148/radiol.2452061031"},{"pmid":18024448,"doi":"10.1148/radiol.2453060798"},{"pmid":18024449,"doi":"10.1148/radiol.2453061481"},{"pmid":18096524,"doi":"10.1148/radiol.2461061676"},{"pmid":18096526,"doi":"10.1148/radiol.2461061994"},{"pmid":18096527,"doi":"10.1148/radiol.2461061245"},{"pmid":18223119,"doi":"10.1148/radiol.2463061038"},{"pmid":18227534,"doi":"10.1148/radiol.2462071831"},{"pmid":18227535,"doi":"10.1148/radiol.2461070309"},{"pmid":18227536,"doi":"10.1148/radiol.2462061775"},{"pmid":18227540,"doi":"10.1148/radiol.2461070121"},{"pmid":18309012,"doi":"10.1148/radiol.2463060881"},{"pmid":18310461,"doi":"10.1148/radiol.2472061846"},{"pmid":18375837,"doi":"10.1148/radiol.2473061909"},{"pmid":18430871,"doi":"10.1148/radiol.2472061331"},{"pmid":18487532,"doi":"10.1148/radiol.2473062124"},{"pmid":18566164,"doi":"10.1148/radiol.2481080256"},{"pmid":18566166,"doi":"10.1148/radiol.2481072190"},{"pmid":18566168,"doi":"10.1148/radiol.2481071497"},{"pmid":18566169,"doi":"10.1148/radiol.2481060339"},{"pmid":18566177,"doi":"10.1148/radiol.2481071451"},{"pmid":18641243,"doi":"10.1148/radiol.2482070988"},{"pmid":18641245,"doi":"10.1148/radiol.2482062110"},{"pmid":18710972,"doi":"10.1148/radiol.2483070362"},{"pmid":18710973,"doi":"10.1148/radiol.2483062112"},{"pmid":18710974,"doi":"10.1148/radiol.2483071416"},{"pmid":18796665,"doi":"10.1148/radiol.2491070783"},{"pmid":18812557,"doi":"10.1148/radiol.2491071336"},{"pmid":18936309,"doi":"10.1148/radiol.2492071313"},{"pmid":19011181,"doi":"10.1148/radiol.2493070976"},{"pmid":19011184,"doi":"10.1148/radiol.2493080240"},{"pmid":19092089,"doi":"10.1148/radiol.2501071322"},{"pmid":19188309,"doi":"10.1148/radiol.2502081075"},{"pmid":19188310,"doi":"10.1148/radiol.2502071998"},{"pmid":19244037,"doi":"10.1148/radiol.2503080253"},{"pmid":19332844,"doi":"10.1148/radiol.2511071897"},{"pmid":19401568,"doi":"10.1148/radiol.2512080485"},{"pmid":19401569,"doi":"10.1148/radiol.2512081235"},{"pmid":19474372,"doi":"10.1148/radiol.2513080636"},{"pmid":19561247,"doi":"10.1148/radiol.2513081280"},{"pmid":19703877,"doi":"10.1148/radiol.2522082335"},{"pmid":19717748,"doi":"10.1148/radiol.2523081972"},{"pmid":19717750,"doi":"10.1148/radiol.2523081929"},{"pmid":19789250,"doi":"10.1148/radiol.2531090611"},{"pmid":19789251,"doi":"10.1148/radiol.2531090689"},{"pmid":19789254,"doi":"10.1148/radiol.2531090302"},{"pmid":19864525,"doi":"10.1148/radiol.2532081199"},{"pmid":19864526,"doi":"10.1148/radiol.2532081738"},{"pmid":12511664,"doi":"10.1148/radiol.2261021292"},{"pmid":12511666,"doi":"10.1148/radiol.2261011296"},{"pmid":12563122,"doi":"10.1148/radiol.2262011600"},{"pmid":12563154,"doi":"10.1148/radiol.2262011992"},{"pmid":19952025,"doi":"10.1148/radiol.2533090179"},{"pmid":20032141,
"doi":"10.1148/radiol.2541090361"},{"pmid":20032142,"doi":"10.1148/radiol.09090021"},{"pmid":20032157,"doi":"10.1148/radiol.09090690"},{"pmid":20089722,"doi":"10.1148/radiol.09090552"},{"pmid":20093507,"doi":"10.1148/radiol.2542082312"},{"pmid":20177082,"doi":"10.1148/radiol.09091264"},{"pmid":20177083,"doi":"10.1148/radiol.09092100"},{"pmid":20177084,"doi":"10.1148/radiol.09090330"},{"pmid":20177086,"doi":"10.1148/radiol.09091324"},{"pmid":20308442,"doi":"10.1148/radiol.09090339"},{"pmid":20413748,"doi":"10.1148/radiol.10090105"},{"pmid":20501711,"doi":"10.1148/radiol.10090877"},{"pmid":20505067,"doi":"10.1148/radiol.10100213"},{"pmid":20574084,"doi":"10.1148/radiol.10090908"},{"pmid":20574087,"doi":"10.1148/radiol.10091938"},{"pmid":20634431,"doi":"10.1148/radiol.10091982"},{"pmid":20720066,"doi":"10.1148/radiol.10092307"},{"pmid":20720065,"doi":"10.1148/radiol.10090397"},{"pmid":20736332,"doi":"10.1148/radiol.10100570"},{"pmid":20829537,"doi":"10.1148/radiol.10100070"},{"pmid":20851933,"doi":"10.1148/radiol.10091298"},{"pmid":20851934,"doi":"10.1148/radiol.10091480"},{"pmid":20851938,"doi":"10.1148/radiol.10091210"},{"pmid":20935079,"doi":"10.1148/radiol.10092373"},{"pmid":20959547,"doi":"10.1148/radiol.10091269"},{"pmid":21084413,"doi":"10.1148/radiol.10100140"},{"pmid":21084414,"doi":"10.1148/radiol.10081490"},{"pmid":21163918,"doi":"10.1148/radiol.10101157"},{"pmid":21183492,"doi":"10.1148/radiol.10092129"},{"pmid":21273517,"doi":"10.1148/radiol.10100161"},{"pmid":21273518,"doi":"10.1148/radiol.10100116"},{"pmid":21273519,"doi":"10.1148/radiol.10081634"},{"pmid":21330566,"doi":"10.1148/radiol.11100569"},{"pmid":21339346,"doi":"10.1148/radiol.10100376"},{"pmid":21339345,"doi":"10.1148/radiol.10100025"},{"pmid":21415247,"doi":"10.1148/radiol.11101887"},{"pmid":21436096,"doi":"10.1148/radiol.11100155"},{"pmid":21502390,"doi":"10.1148/radiol.11090563"},{"pmid":21502391,"doi":"10.1148/radiol.11091276"},{"pmid":21586679,"doi":"10.1148/radiol.11101352"},{"pmid":21602502,"doi":"10.1148/radiol.11081489"},{"pmid":21602503,"doi":"10.1148/radiol.11101362"},{"pmid":21693659,"doi":"10.1148/radiol.11110333"},{"pmid":21778451,"doi":"10.1148/radiol.11101359"},{"pmid":21778450,"doi":"10.1148/radiol.11101104"},{"pmid":21803921,"doi":"10.1148/radiol.11101344"},{"pmid":21931140,"doi":"10.1148/radiol.11101688"},{"pmid":21931139,"doi":"10.1148/radiol.11101922"},{"pmid":21931141,"doi":"10.1148/radiol.11091822"},{"pmid":22012900,"doi":"10.1148/radiol.11111099"},{"pmid":22012903,"doi":"10.1148/radiol.11091882"},{"pmid":22012902,"doi":"10.1148/radiol.11101426"},{"pmid":22012904,"doi":"10.1148/radiol.11091207"},{"pmid":22012899,"doi":"10.1148/radiol.11111131"},{"pmid":22095994,"doi":"10.1148/radiol.11110474"},{"pmid":22095995,"doi":"10.1148/radiol.11091710"},{"pmid":22156992,"doi":"10.1148/radiol.11110423"},{"pmid":22190655,"doi":"10.1148/radiol.11101996"},{"pmid":22190656,"doi":"10.1148/radiol.11110144"},{"pmid":22357880,"doi":"10.1148/radiol.11110947"},{"pmid":22357881,"doi":"10.1148/radiol.11101384"},{"pmid":22438443,"doi":"10.1148/radiol.11111111"},{"pmid":22438439,"doi":"10.1148/radiol.12110462"},{"pmid":22438440,"doi":"10.1148/radiol.11101821"},{"pmid":22517953,"doi":"10.1148/radiol.12110446"},{"pmid":22517956,"doi":"10.1148/radiol.12111869"},{"pmid":22517954,"doi":"10.1148/radiol.12110433"},{"pmid":22517959,"doi":"10.1148/radiol.12111605"},{"pmid":22623691,"doi":"10.1148/radiol.12110526"},{"pmid":22623690,"doi":"10.1148/radiol.12102394"},{"pmid":22623696,"doi":"10.1148/radiol.12112114"},{"pmid":22692035,"
doi":"10.1148/radiol.12112265"},{"pmid":22723560,"doi":"10.1148/radiol.12110772"},{"pmid":22723559,"doi":"10.1148/radiol.12110339"},{"pmid":22798223,"doi":"10.1148/radiol.12111561"},{"pmid":22821690,"doi":"10.1148/radiol.12112678"},{"pmid":22821695,"doi":"10.1148/radiol.12111703"},{"pmid":22821694,"doi":"10.1148/radiol.12111658"},{"pmid":22919038,"doi":"10.1148/radiol.12110810"},{"pmid":22919039,"doi":"10.1148/radiol.12110357"},{"pmid":22993219,"doi":"10.1148/radiol.12111270"},{"pmid":22993217,"doi":"10.1148/radiol.12111769"},{"pmid":22966066,"doi":"10.1148/radiol.12112201"},{"pmid":23093707,"doi":"10.1148/radiol.12111740"},{"pmid":23175542,"doi":"10.1148/radiol.12120354"},{"pmid":23264525,"doi":"10.1148/radiol.12112469"},{"pmid":23220901,"doi":"10.1148/radiol.12110853"},{"pmid":23070271,"doi":"10.1148/radiol.12120240"}] for count, ident in enumerate(identifiers): doi = ident['doi'] url = 'http://pubs.rsna.org/doi/full/%s' % doi try: sys.stdout.write("article # " + str(count) + " reading url...") limitReached = True while True: if not limitReached: break try: start = time() r = br.open(url) limitReached = False except: limitReached = True sys.stdout.write("limit reached, waiting...") sleep(3600) entry_url = r.geturl() entry_html_source = r.read() soup = BeautifulSoup(entry_html_source.decode('utf-8'), 'html5lib') is_not_free = soup.find(id='accessDenialWidget') if is_not_free is not None: sys.stdout.write(str(time()-start) + " seconds") sys.stdout.write("...skipping, article not free.\n") sys.stdout.flush() else: sys.stdout.write("adding to database...") # format of returned list from get_metadata function: # 0 identifier # 1 type # 2 language # 3 title # 4 date # 5 publisher # 6 author # 7 journal # 8 volume # 9 issue # 10 firstpage # 11 lastpage # 12 url res_metadata = parser.get_metadata(entry_url, entry_html_source) res_metadata[1] = 'Radiology' res_metadata[0] = doi # creates new Resource object and containing Subresource objects # creates Resource based on returned parser metadata res = Resource(identifier = res_metadata[0], type = res_metadata[1], language = res_metadata[2], title = res_metadata[3], date = res_metadata[4], publisher = res_metadata[5], author = res_metadata[6], journal = res_metadata[7], volume = res_metadata[8], issue = res_metadata[9], firstpage = res_metadata[10], lastpage = res_metadata[11], url = entry_url, html_source = entry_html_source) res.save() res.user.add(9) # corresponds to [email protected] #res.user.add(2) # corresponds to [email protected] res.domain.add(1) # corresponds to Biomedical subres = [] # creates Subresource objects of type 'figure' figures = parser.get_figures(entry_url, entry_html_source) for i, figure in enumerate(figures): try: f = urllib2.urlopen(urllib2.Request(figure[4])) deadLinkFound = False except: deadLinkFound = True if deadLinkFound: url_correct = figure[3] else: url_correct = figure[4] subres.append(Subresource(containing_resource = res, name = figure[0], type = 'figure', content = figure[1], url = url_correct)) # creates Subresource objects of type 'paragraph' paragraphs = parser.get_paragraphs(entry_url, entry_html_source) for i, paragraph in enumerate(paragraphs): subres.append(Subresource(containing_resource = res, name = 'paragraph ' + str(i), type = 'paragraph', content = paragraph)) subres_temp = Subresource.objects.bulk_create(subres) del subres_temp del subres sys.stdout.write(str(time()-start) + " seconds\n") sys.stdout.flush() except Exception, e: print "failed. 
exception: "+str(e) traceback.print_exc()''' #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sys.stdout.write("~~~~loading concepts and term lists... ") start = time() file = open('scripts\MESH_concept_and_terms_tuple.pkl', 'rb') (tot_concepts, concept_IDs, term_lists) = pickle_zloads(file.read()) file.close() sys.stdout.write("%.2f" % (time() - start) + "seconds\n") sys.stdout.flush() res_ids = list( Resource.objects.filter(type="Radiology").values_list('id', flat=True)) print "total # of resources: " + str(len(res_ids)) for count, res_id in enumerate(res_ids): try: sys.stdout.write("article # " + str(count) + " processing...") start = time() target_paragraphs = Subresource.objects.filter( containing_resource_id=res_id) #create sentences from target_paragraphs sentences = [] sentences_indexofparagraph = [] tot_para = 0 tot_sent = 0 for para_num, target_paragraph in enumerate(target_paragraphs): #find all sentence in this paragraph tokenized_sentences = sent_tokenize( target_paragraph.content.rstrip()) sentences.extend(tokenized_sentences) sentences_indexofparagraph.extend([para_num] * len(tokenized_sentences)) tot_sent = tot_sent + len(tokenized_sentences) tot_para = tot_para + 1 tot_para = len(target_paragraphs) #second go through each concept/term, find them in subresources, and process into matrix tc = 0 j = 0 row_sentence = [] row_paragraph = [] col_sentence = [] col_paragraph = [] data_sentence = [] data_paragraph = [] # initialize list of empty lists for storing concepts contained in each paragraph para_conceptIDs_contained = [[] for i in range(tot_para)] for i, con_ID in enumerate(concept_IDs): term_list = term_lists[i] wordcount_in_paragraphs = [0] * tot_para terms_regex = [ r"\b" + re2.escape(term.lower()) + r"\b" for term in term_list ] search_pattern = re2.compile("|".join(terms_regex)) for sent_num, sentence in enumerate(sentences): wordcount = len(search_pattern.findall(sentence.lower())) if wordcount > 0: #only go ahead if search_pattern is in the sentence row_sentence.append(sent_num) col_sentence.append(tc) data_sentence.append(1) wordcount_in_paragraphs[ sentences_indexofparagraph[sent_num]] += wordcount for para_num in range(tot_para): wordcount_in_p = wordcount_in_paragraphs[para_num] if wordcount_in_p > 0: row_paragraph.append(para_num) col_paragraph.append(tc) data_paragraph.append(1) para_conceptIDs_contained[para_num].append(con_ID) if tc * 10 / tot_concepts > j: percent_done = tc * 10 / tot_concepts * 10 sys.stdout.write(str(percent_done) + "% ") j = j + 1 tc = tc + 1 # update concepts_contained fields for all subresource objects for para_num in range(tot_para): if len(para_conceptIDs_contained[para_num]) > 0: target_paragraphs[para_num].concepts_contained.add( *para_conceptIDs_contained[para_num]) #create target_A matrix target_A_sentence = coo_matrix( (array(data_sentence), (array(row_sentence), array(col_sentence))), shape=(tot_sent, tot_concepts), dtype=int16) #target_A_paragraph = coo_matrix((array(data_paragraph),(array(row_paragraph),array(col_paragraph))),shape=(tot_para,tot_concepts),dtype=int16) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # now convert target_A into a scipy csr_matrix (sparse matrix) target_A_sentence = target_A_sentence.tocsr() #target_A_paragraph = target_A_paragraph.tocsr() # calculate AtA for target_A AtA_sentence = target_A_sentence.T * target_A_sentence #AtA_paragraph = target_A_paragraph.T * target_A_paragraph # add 
AtA to Big_A if count == 0: bigA_AtA_sentence = AtA_sentence N_sentence = tot_sent #bigA_AtA_paragraph = AtA_paragraph #N_paragraph = tot_para else: bigA_AtA_sentence = bigA_AtA_sentence + AtA_sentence N_sentence = N_sentence + tot_sent #bigA_AtA_paragraph = bigA_AtA_paragraph + AtA_paragraph #N_paragraph = N_paragraph + tot_para sys.stdout.write(str(time() - start) + " seconds\n") sys.stdout.flush() except Exception, e: print "failed. exception: " + str(e) traceback.print_exc()
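The accumulation at the end of this script is a running sum of per-article co-occurrence matrices: A is the sentences-by-concepts indicator matrix for one article, A.T * A counts how often two concepts share a sentence, and those products are added up across articles. A compact sketch of that step with scipy, using toy matrices and a helper name of my own in place of the per-article data:

from numpy import array, int16
from scipy.sparse import coo_matrix

def sentence_concept_matrix(rows, cols, n_sentences, n_concepts):
    # Binary sentences-x-concepts indicator matrix in CSR form.
    data = [1] * len(rows)
    return coo_matrix((array(data), (array(rows), array(cols))),
                      shape=(n_sentences, n_concepts), dtype=int16).tocsr()

n_concepts = 3
articles = [
    sentence_concept_matrix([0, 0, 1], [0, 2, 1], n_sentences=2, n_concepts=n_concepts),
    sentence_concept_matrix([0, 1, 1], [0, 0, 2], n_sentences=2, n_concepts=n_concepts),
]

bigA_AtA = None
N_sentences = 0
for A in articles:
    AtA = A.T * A                    # concept-by-concept co-occurrence counts for this article
    bigA_AtA = AtA if bigA_AtA is None else bigA_AtA + AtA
    N_sentences += A.shape[0]

print(N_sentences)          # 4
print(bigA_AtA.toarray())   # symmetric co-occurrence matrix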
def test_escape(self, pattern, expected_escaped):
    escaped = re2.escape(pattern)
    self.assertEqual(expected_escaped, escaped)
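This reads like a parametrized test whose pattern/expected pairs are supplied elsewhere. A minimal way to exercise the same assertion outside the framework, shown here against the stdlib re (the re2 wrapper's escape is presumably meant to behave the same) and assuming Python 3.7+, where re.escape only escapes regex-special characters; the cases are made up:

import re

cases = [
    ("a.b", r"a\.b"),
    ("1+1=2", r"1\+1=2"),
    ("plain_text", "plain_text"),
]

for pattern, expected in cases:
    assert re.escape(pattern) == expected, (pattern, re.escape(pattern))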
# second, go through each concept/term, find them in subresources, and process into matrix
tc = 0
j = 0
row_sentence = []
row_paragraph = []
col_sentence = []
col_paragraph = []
data_sentence = []
data_paragraph = []
# initialize list of empty lists for storing concepts contained in each paragraph
para_conceptIDs_contained = [[] for i in range(tot_para)]
for i, con_ID in enumerate(concept_IDs):
    term_list = term_lists[i]
    wordcount_in_paragraphs = [0] * tot_para
    terms_regex = [r"\b" + re2.escape(term.lower()) + r"\b" for term in term_list]
    search_pattern = re2.compile("|".join(terms_regex))
    for sent_num, sentence in enumerate(sentences):
        wordcount = len(search_pattern.findall(sentence.lower()))
        if wordcount > 0:  # only go ahead if search_pattern is in the sentence
            row_sentence.append(sent_num)
            col_sentence.append(tc)
            data_sentence.append(1)
            wordcount_in_paragraphs[sentences_indexofparagraph[sent_num]] += wordcount
    for para_num in range(tot_para):
        wordcount_in_p = wordcount_in_paragraphs[para_num]
        if wordcount_in_p > 0:
            row_paragraph.append(para_num)
            col_paragraph.append(tc)
            data_paragraph.append(1)
            para_conceptIDs_contained[para_num].append(con_ID)
def __init__(self, batch_size, n_cpus, n_threads, mode):
    print('loading model...', end=' ')
    self.nlp = english_model.load()
    self.nlp.remove_pipe('tagger')
    self.nlp.remove_pipe('ner')
    punct = list(string.punctuation)
    punct.remove('.')
    punct.append('[**')
    punct.append('**]')
    punct = [re.escape(p) for p in punct]
    prefixes_custom = tuple(punct)
    infixes_custom = tuple(punct)
    suffixes_custom = tuple(punct)
    #prefixes_custom = tuple([r'\[\*\*', r'('])
    #suffixes_custom = tuple([r'\*\*\]', r')'])
    #infixes_custom = tuple([r'\[\*\*', r'\*\*\]', r'(', r')', r'>', r'<', r'->', r'-->', r'--->'])
    exceptions_custom = {id: pattern for id, pattern in tokenizer_utils.generate_matcher_pattern1()}
    exceptions = update_exc(self.nlp.Defaults.tokenizer_exceptions, exceptions_custom)
    prefix_re = compile_prefix_regex(self.nlp.Defaults.prefixes + prefixes_custom)
    infix_re = compile_infix_regex(infixes_custom + self.nlp.Defaults.infixes)
    suffix_re = compile_suffix_regex(self.nlp.Defaults.suffixes + suffixes_custom)
    tokenizer = SpacyTokenizer(self.nlp.vocab, rules=exceptions,
                               prefix_search=prefix_re.search,
                               suffix_search=suffix_re.search,
                               infix_finditer=infix_re.finditer,
                               token_match=self.nlp.Defaults.token_match)
    self.nlp.tokenizer = tokenizer
    matcher = Matcher(self.nlp.vocab)

    def on_match_pattern(matcher, doc, id, matches):
        match_id, start, end = matches[id]
        if self.nlp.vocab.strings[match_id].startswith('p3'):
            span = doc[start+1:end]
            span.merge()
            for i in range(id, len(matches)):
                matches[i] = (matches[i][0], matches[i][1] - 1, matches[i][2] - 1)
        elif self.nlp.vocab.strings[match_id].startswith('p2.1'):
            span1 = doc[start:start+2]
            span2 = doc[start+2:end]
            span1.merge()
            span2.merge()
            for i in range(id, len(matches)):
                matches[i] = (matches[i][0], matches[i][1] - 2, matches[i][2] - 2)
        elif self.nlp.vocab.strings[match_id].startswith('p2.2'):
            span2 = doc[start+1:end]
            span2.merge()
            for i in range(id, len(matches)):
                matches[i] = (matches[i][0], matches[i][1] - 1, matches[i][2] - 1)
        elif self.nlp.vocab.strings[match_id].startswith('p2.3'):
            span1 = doc[start:start+2]
            span1.merge()
            for i in range(id, len(matches)):
                matches[i] = (matches[i][0], matches[i][1] - 1, matches[i][2] - 1)

    for id, pattern in tokenizer_utils.generate_matcher_pattern2():
        matcher.add(id, on_match_pattern, pattern)
    for id, pattern in tokenizer_utils.generate_matcher_pattern3():
        matcher.add(id, on_match_pattern, pattern)
    self.nlp.add_pipe(matcher, before='parser')
    print('done')
    self.batch_size = batch_size
    self.n_cpus = n_cpus
    self.n_threads = n_threads
    self.mode = mode
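The escaping step above is what lets raw punctuation strings, including the multi-character '[**' and '**]' de-identification markers, be fed into regex-based tokenizer rules. A standalone sketch of just that part, using plain re rather than spaCy's compile_*_regex helpers:

import re
import string

punct = list(string.punctuation)
punct.remove('.')                 # keep '.' attached to tokens, as the tokenizer above does
punct.extend(['[**', '**]'])      # de-identification markers

# Escape every entry so '[**' becomes \[\*\* instead of an unterminated character class;
# sort longest-first so multi-character markers win over single characters.
prefix_re = re.compile(r'^(?:' +
                       '|'.join(re.escape(p) for p in sorted(punct, key=len, reverse=True)) +
                       r')')

print(prefix_re.match('[**Name**] was admitted').group())   # '[**'
print(prefix_re.match('(see note)').group())                # '('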