def _build_taggers(self): behind, ahead = self.looks.to_rex() self._taggers = {} for name, version_rexes in self.versions.iteritems(): self._taggers[name] = [] if not isinstance(version_rexes, list): version_rexes = [version_rexes] for rex in self.rexes: for version_rex in version_rexes: tagger = rex + self.separator + behind + version_rex + ahead if self.compile_rex: tagger = re.compile(tagger, self.flags) self._taggers[name].append(tagger)
def _build_taggers(self): """Build a list of regular expressions, each containing a group named "version"; store in self._taggers """ behind, ahead = self.looks.to_rex() self._taggers = [ rex + self.separator + behind + self.arbitrary_rex + ahead for rex in self.rexes ] if self.compile_rex: self._taggers = [ re.compile(rex, self.flags) for rex in self._taggers ]
def fetch_versions():
    """Fetch version information from Wikipedia.

    Downloads the MATLAB article, locates the first table with a
    "wikitable" class after the element whose id is "Release_history", and
    feeds every ``<tr>`` of that table to ``parse_version_row``. Rows for
    which the parser returns None are dropped.

    :return: Dictionary mapping version numbers to lists of version labels
    """
    page = requests.get('http://en.wikipedia.org/wiki/MATLAB')
    soup = BeautifulSoup(page.content)
    headline = soup.find(id='Release_history')
    table = headline.find_next('table', class_=re.compile(r'wikitable'))
    parsed_rows = (parse_version_row(row) for row in table.find_all('tr'))
    return dict(entry for entry in parsed_rows if entry is not None)
) spgr = RexTagger( 'spgr', [ r'\Wspgr\W', ] ) ### Trajectories ### epi = RexTagger( 'epi', [ r'echo{dlm}planar'.format(dlm=delimiter), re.compile(r'EPI'), ] ) spiral = RexTagger( 'spiral', [ r'spiral{dlm}in'.format(dlm=delimiter), r'spiral{dlm}out'.format(dlm=delimiter), ] ) spiral_context = MultiRexTagger( 'spiral', [ r'spiral',
] ) realign = RexTagger( 'realign', [ r'realign', r'{mov}{dlm}correct'.format( mov=mov_ptn, dlm=delimiter ), r'(motion|movement|translation|rotation){dlm}parameter'.format( dlm=delimiter, ), r'automat(ed|ic){dlm}image{dlm}registration'.format(dlm=delimiter), re.compile('\WAIR\W'), r'{cor}{dlm}(for)?{dlm}(bulk|whole|participants?|subjects?)?{dlm}(head)?{dlm}{mov}'.format( cor=cor_ptn, dlm=delimiter, mov=mov_ptn, ), r'{cor}{dlm}(for)?.{{,25}}and{dlm}(bulk|whole|participants?|subjects?)?{dlm}(head)?{dlm}{mov}'.format( cor=cor_ptn, dlm=delimiter, mov=mov_ptn, ), r'mcflirt', r'3dvolreg', ] )
r'small{dlm}volume{dlm}correction'.format(dlm=delimiter), r'\Wsvc\W', ]) # Tag AlphaSim separately from general Monte Carlo methods to identify # incorrect use when applied without smoothness estimation alphasim = RexTagger('alphasim', [ r'alpha{dlm}sim'.format(dlm=delimiter), r'clustsim', ]) alphasim_context = MultiRexTagger('alphasim', [ r'monte{dlm}carlo'.format(dlm=delimiter), ], [ r'rest{dlm}fmri'.format(dlm=delimiter), re.compile(r'AFNI'), re.compile(r'REST'), ]) monte = RexTagger('monte', [ r'alpha{dlm}sim'.format(dlm=delimiter), r'clustsim', r'monte{dlm}carlo{dlm}correct'.format(dlm=delimiter), ]) monte_context = MultiRexTagger('monte', [ r'monte{dlm}carlo'.format(dlm=delimiter), ], [ r'multiple{dlm}comparison'.format(dlm=delimiter), r'rest{dlm}fmri'.format(dlm=delimiter), r'threshold',
rest = RexTagger( 'rest', [ r'resting{dlm}state{dlm}fmri{dlm}data{dlm}analysis{dlm}toolkit'.format(dlm=delimiter), r'rest{dlm}(by)?{dlm}song{dlm}xiao'.format(dlm=delimiter), r'resting\-fmri\.sourceforge\.net', r'sourceforge\.net/projects/resting\-fmri', r'restfmri\.net', ] ) aal = RexTagger( 'aal', [ re.compile(r'\WAAL\W'), r'automatic{dlm}anatomic(al)?{dlm}label'.format(dlm=delimiter), ] ) snpm = RexTagger( 'snpm', [ r'\Wsnpm\W', r'statistical{dlm}non{dlm}parametric{dlm}mapping'.format(dlm=delimiter), ] ) spmd = RexTagger( 'spmd', [
import functools

from neurotrends.config import re

# Type of a compiled pattern, obtained by compiling an empty pattern.
REX_TYPE = type(re.compile(''))
DEF_FLAGS = re.IGNORECASE | re.VERBOSE
NCHAR = 50
UNKNOWN_VERSION = '?'


def rex_wrap(rex, wrap=''):
    """Return ``rex`` enclosed in parentheses.

    :param rex: Pattern text to enclose
    :param wrap: Text inserted right after the opening parenthesis,
        e.g. ``'?:'`` for a non-capturing group
    :return: Unicode string ``(<wrap><rex>)``
    """
    # ``u''`` rather than ``ur''``: the literal has no backslashes, so both
    # spell the same string on Python 2, but ``ur''`` is a SyntaxError on
    # Python 3.
    return u'({wrap}{rex})'.format(rex=rex, wrap=wrap)


def rex_named(rex, name):
    """Return ``rex`` wrapped as the named group ``(?P<name>rex)``."""
    return rex_wrap(rex, '?P<{}>'.format(name))


# Shorthands for the group and look-around prefixes.
rex_noncap = functools.partial(rex_wrap, wrap='?:')
rex_posbehind = functools.partial(rex_wrap, wrap='?<=')
rex_negbehind = functools.partial(rex_wrap, wrap='?<!')
rex_posahead = functools.partial(rex_wrap, wrap='?=')
rex_negahead = functools.partial(rex_wrap, wrap='?!')


# TODO: Only keep earliest tags
def uextend(base, vals):
    """Append to ``base``, in place, each item of ``vals`` not already in it.

    Membership is tested with ``in``, so items only need to support
    equality comparison; the order of ``vals`` is preserved.
    """
    for val in vals:
        if val not in base:
            base.append(val)
### Sequences ### mprage = RexTagger('mprage', [ r'\Wmp{dlm}rage\W'.format(dlm=delimiter), ]) spgr = RexTagger('spgr', [ r'\Wspgr\W', ]) ### Trajectories ### epi = RexTagger('epi', [ r'echo{dlm}planar'.format(dlm=delimiter), re.compile(r'EPI'), ]) spiral = RexTagger('spiral', [ r'spiral{dlm}in'.format(dlm=delimiter), r'spiral{dlm}out'.format(dlm=delimiter), ]) spiral_context = MultiRexTagger('spiral', [ r'spiral', ], [ r'mri', r'bold', r'imag', r'scan', r'data',
from neurotrends.config import re from neurotrends.tagger import RexTagger, MultiRexTagger from ..misc import delimiter ge = RexTagger( 'ge', [ r'general{dlm}electric'.format(dlm=delimiter), ] ) # Use MultiRexTagger for GE to disambiguate names: e.g. GE Smith ge_context = MultiRexTagger( 'ge', [ re.compile(r'\bGE\b'), ], [ r'mri', r'scan', r'tesla', ] ) siemens = RexTagger( 'siemens', [r'siemens'] ) philips = RexTagger( 'philips',
# Tagger labelled 'surfrend': a non-word character, then "surf" and "rend"
# joined by the shared ``delimiter`` pattern.
surfrend = RexTagger('surfrend', [
    r'\Wsurf{dlm}rend'.format(dlm=delimiter),
])

# Tagger labelled 'rest': the toolkit's full name, "rest (by) song xiao", or
# one of three URLs. The bare word "rest" is not among the patterns.
rest = RexTagger('rest', [
    r'resting{dlm}state{dlm}fmri{dlm}data{dlm}analysis{dlm}toolkit'.format(
        dlm=delimiter),
    r'rest{dlm}(by)?{dlm}song{dlm}xiao'.format(dlm=delimiter),
    r'resting\-fmri\.sourceforge\.net',
    r'sourceforge\.net/projects/resting\-fmri',
    r'restfmri\.net',
])

# Tagger labelled 'aal'. The acronym is passed pre-compiled without flags, so
# it is matched case-sensitively; the spelled-out form is a plain string.
aal = RexTagger('aal', [
    re.compile(r'\WAAL\W'),
    r'automatic{dlm}anatomic(al)?{dlm}label'.format(dlm=delimiter),
])

# Tagger labelled 'snpm': the acronym between non-word characters, or the
# spelled-out name.
snpm = RexTagger('snpm', [
    r'\Wsnpm\W',
    r'statistical{dlm}non{dlm}parametric{dlm}mapping'.format(dlm=delimiter),
])

# Tagger labelled 'spmd'. The second pattern has literal whitespace around it,
# so it presumably relies on RexTagger compiling strings in verbose mode --
# confirm against RexTagger.
spmd = RexTagger('spmd', [
    r'\Wspmd\W',
    r''' statistical{dlm}parametric{dlm}mapping{dlm}diagnosis '''.format(dlm=delimiter),
])
def rex_compile(rex, flags=DEF_FLAGS):
    """Return a compiled pattern for ``rex``.

    :param rex: Pattern string, or an already-compiled pattern
    :param flags: Flags used when compiling a string; ignored when ``rex``
        is already an instance of ``REX_TYPE``
    :return: ``rex`` itself if already compiled, else ``re.compile(rex)``
    """
    already_compiled = isinstance(rex, REX_TYPE)
    return rex if already_compiled else re.compile(rex, flags=flags)
r'fiasco', r''' functional{dlm}imaging{dlm} analysis{dlm}software' '''.format(dlm=delimiter), ], ) fidl = RexTagger( 'fidl', [ r'fidl', ], ) fiswidgets = RexTagger( 'fiswidgets', [ r'fiswidgets', ], ) itk = RexTagger( 'itk', [ re.compile(r'\bITK\b'), r'itk\.org', r'insight{dlm}tool{dlm}kit'.format(dlm=delimiter), ], )
], ) tmpdrv_context = MultiRexTagger( "tmpdrv", [r"(hrf|hdr|ha?emodynamic{dlm}response)".format(dlm=delimiter)], [r"first{dlm}derivative".format(dlm=delimiter)], separator="[^.,:;?]*", ) dspdrv = RexTagger("dspdrv", [r"dispersion{dlm}derivative".format(dlm=delimiter)]) fir = RexTagger( "fir", [ re.compile(r"\WFIR\W"), r"\Wfir{dlm}\)?(basis|set)".format(dlm=delimiter), r"finite{dlm}impulse{dlm}response".format(dlm=delimiter), ], ) fir_context = MultiRexTagger( "fir", [ r"not?{dlm}assum".format(dlm=delimiter), r"not{dlm}make{dlm}(any)?{dlm}assum".format(dlm=delimiter), r"ma[dk](es?|ing)?{dlm}no{dlm}assum".format(dlm=delimiter), ], [ hrf_ptn, r"response{dlm}shape".format(dlm=delimiter),
alphasim = RexTagger( 'alphasim', [ r'alpha{dlm}sim'.format(dlm=delimiter), r'clustsim', ] ) alphasim_context = MultiRexTagger( 'alphasim', [ r'monte{dlm}carlo'.format(dlm=delimiter), ], [ r'rest{dlm}fmri'.format(dlm=delimiter), re.compile(r'AFNI'), re.compile(r'REST'), ] ) monte = RexTagger( 'monte', [ r'alpha{dlm}sim'.format(dlm=delimiter), r'clustsim', r'monte{dlm}carlo{dlm}correct'.format(dlm=delimiter), ] ) monte_context = MultiRexTagger( 'monte',
r'event{dlm}related{dlm}tatistic'.format(dlm=delimiter), r'event{dlm}related{dlm}functional{dlm}mri'.format(dlm=delimiter), r'event{dlm}related{dlm}functional{dlm}magnetic'.format(dlm=delimiter), r'\Wer{dlm}fmri\W'.format(dlm=delimiter), r'single{dlm}event{dlm}design'.format(dlm=delimiter), r'event{dlm}of{dlm}interest'.format(dlm=delimiter), r'stick{dlm}function'.format(dlm=delimiter), ], rexes_negative=[ r'\Wpp\.{dlm}\d+'.format(dlm=delimiter), r'\d+[\s-]+\d+', r'neuroimage', r'neuron', r'proceedings', r'transactions', re.compile(r'Trans[^a-zA-Z]'), re.compile(r'Research'), r'pnas', r'biol(ogical)?{dlm}psych'.format(dlm=delimiter), r'j(ournal{dlm}of)?{dlm}neurosci'.format(dlm=delimiter), r'arch(ives{dlm}of)?{dlm}gen'.format(dlm=delimiter), r'brain{dlm}cogn'.format(dlm=delimiter), r'journal', r'plos', r'frontiers', ] ) block = RexTagger( 'block', [
}, 'pdf': { 'class': PDFDocument, 'field': 'publisher_pdf', }, } # Some PubMed records encode dates in unhelpful formats like "2006 May-Aug" # or "1999 Jan 1-15". This pattern extracts the useful part of these strings # ("2006 May", "1999 Jan") and discards the rest. month_range_pattern = re.compile( r''' ( \d{4} \s [a-z]{3} ) [\s\-] .* ''', re.I | re.X, ) # TODO: Test me def parse_publication_date(date_text): date_text = re.sub(month_range_pattern, '\\1', date_text) try: return dateparser.parse(date_text) except (TypeError, ValueError, AttributeError): return None
'time{dlm}(and{dlm}dispersion)?{dlm}derivative'.format(dlm=delimiter), ]) tmpdrv_context = MultiRexTagger('tmpdrv', [ r'(hrf|hdr|ha?emodynamic{dlm}response)'.format(dlm=delimiter), ], [ r'first{dlm}derivative'.format(dlm=delimiter), ], separator='[^.,:;?]*') dspdrv = RexTagger('dspdrv', [ r'dispersion{dlm}derivative'.format(dlm=delimiter), ]) fir = RexTagger('fir', [ re.compile(r'\WFIR\W'), r'\Wfir{dlm}\)?(basis|set)'.format(dlm=delimiter), r'finite{dlm}impulse{dlm}response'.format(dlm=delimiter), ]) fir_context = MultiRexTagger('fir', [ r'not?{dlm}assum'.format(dlm=delimiter), r'not{dlm}make{dlm}(any)?{dlm}assum'.format(dlm=delimiter), r'ma[dk](es?|ing)?{dlm}no{dlm}assum'.format(dlm=delimiter), ], [ hrf_ptn, r'response{dlm}shape'.format(dlm=delimiter), r'shape{dlm}of{dlm}(the)?{dlm}response'.format(dlm=delimiter), r'time{dlm}cource'.format(dlm=delimiter), ], separator='[^.,:;?]*')
def get_matlab_versions(overwrite=False):
    """Get MATLAB versions from Wikipedia.

    Results are cached in a shelf file under ``trendpath.data_dir``; the
    cached copy is returned unless ``overwrite`` is truthy or the file does
    not exist.

    :param overwrite: Overwrite existing data?
    :return: Dictionary mapping each version number to a list holding the
        version number and, when non-empty, the processed version name
    """
    version_file = os.path.join(trendpath.data_dir, 'matlab-versions.shelf')

    # Use saved versions if version file exists and not overwrite.
    # NOTE(review): some dbm backends append a suffix to the file name, in
    # which case this existence check never succeeds -- confirm.
    if os.path.exists(version_file) and not overwrite:
        shelf = shelve.open(version_file)
        try:
            return shelf['versions']
        finally:
            # Close the shelf even if the key is missing or unreadable.
            shelf.close()

    # Open Wikipedia page
    response = requests.get('http://en.wikipedia.org/wiki/MATLAB')
    soup = BeautifulSoup(response.content)

    # Find "Release History" table
    history_headline = soup.find(id='Release_history')
    history_table = history_headline.find_next(
        'table',
        class_=re.compile(r'wikitable'),
    )
    history_rows = history_table.find_all('tr')

    versions = {}

    # The first row is skipped as the table header.
    for row in history_rows[1:]:

        tds = row.find_all('td')
        # Rows without both a version-number and a version-name cell cannot
        # be parsed; skip them rather than fail with IndexError.
        if len(tds) < 2:
            continue

        # Version number, without any leading "MATLAB "
        version_number = re.sub(r'matlab\s+', '', tds[0].text, flags=re.I)

        version_name = tds[1].text

        # Make "r" in e.g. "r2007a" optional (applies to every "r"/"R")
        version_name = re.sub('r', 'r?', version_name, flags=re.I)

        # "Service Pack" -> "sp"
        version_name = re.sub(
            r'{dlm}(sp|service pack){dlm}'.format(dlm=delimiter),
            'sp',
            version_name,
            flags=re.I
        )

        versions[version_number] = [version_number]
        if version_name:
            versions[version_number].append(version_name)

    # Save results to version file
    shelf = shelve.open(version_file)
    try:
        shelf['versions'] = versions
    finally:
        shelf.close()

    return versions
# -*- coding: utf-8 -*-

from neurotrends.config import re
from neurotrends.tagger import RexTagger, MultiRexTagger

from ..misc import delimiter

ge = RexTagger('ge', [
    r'general{dlm}electric'.format(dlm=delimiter),
])

# Use MultiRexTagger for GE to disambiguate names: e.g. GE Smith
ge_context = MultiRexTagger('ge', [
    # Pre-compiled without flags, so the acronym is matched case-sensitively.
    re.compile(r'\bGE\b'),
], [
    r'mri',
    r'scan',
    r'tesla',
])

siemens = RexTagger('siemens', [r'siemens'])

philips = RexTagger('philips', [r'philips'])

bruker = RexTagger('bruker', [r'bruker'])

varian = RexTagger('varian', [r'varian\W'])

# The pattern used to be the misspelling "shimazdu", so the correct spelling
# "shimadzu" (the tag label) was never matched. Both spellings are accepted
# now. The module-level name keeps its original spelling so that existing
# references to it continue to work.
shimazdu = RexTagger('shimadzu', [r'shima(dz|zd)u'])

marconi = RexTagger('marconi', [r'marconi'])
from neurotrends.tagger import RexTagger, MultiRexTagger

from misc import delimiter

# Secondary patterns indicating that the surrounding text is about a
# computing platform; passed as the second pattern list of ``mac_context``.
os_secondary_ptn = [
    r'\Wpc\W',
    r'\Wos\W',
    r'operating',
    r'platform',
    r'environment',
    r'workstation',
]

# Tagger labelled 'mac': phrases taken as a Mac reference on their own.
# NOTE(review): ``re`` is used here but its import is not among the visible
# lines -- confirm it is imported earlier in the file.
mac = RexTagger(
    'mac',
    [
        # Pre-compiled without flags, so "OS X" is matched case-sensitively.
        re.compile(r'OS[-/\s]*X'),
        r'\Wmac{dlm}os\W'.format(dlm=delimiter),
        r'spss{dlm}for{dlm}mac'.format(dlm=delimiter),
    ]
)

# Context tagger labelled 'mac': "apple" / "mac(intosh)" paired with one of
# the platform patterns above. The separator excludes the characters
# . , : ; ? -- presumably to keep both parts within one clause; confirm
# against MultiRexTagger.
mac_context = MultiRexTagger(
    'mac',
    [
        r'apple',
        r'mac(intosh)?',
    ],
    os_secondary_ptn,
    separator='[^.,:;?]*'
)
'pdf': { 'class': PDFDocument, 'field': 'publisher_pdf', }, } # Some PubMed records encode dates in unhelpful formats like "2006 May-Aug" # or "1999 Jan 1-15". This pattern extracts the useful part of these strings # ("2006 May", "1999 Jan") and discards the rest. month_range_pattern = re.compile( r''' ( \d{4} \s [a-z]{3} ) [\s\-] .* ''', re.I | re.X, ) # TODO: Test me def parse_publication_date(date_text): date_text = re.sub(month_range_pattern, '\\1', date_text) try: return dateparser.parse(date_text) except (TypeError, ValueError, AttributeError): return None
category = 'analysis' from neurotrends.config import re from neurotrends.tagger import RexTagger, MultiRexTagger from misc import delimiter kda = RexTagger( 'kda', [ re.compile(r'\WKDA\W'), ] ) kda_context = MultiRexTagger( 'kda', [ r'kernel{dlm}density{dlm}analysis'.format(dlm=delimiter), ], [ r'multi{dlm}level'.format(dlm=delimiter), ], separator='[^.,:;?]*' ) mkda = RexTagger( 'mkda', [ r''' multi{dlm}level{dlm}kernel{dlm}density{dlm}analysis '''.format(dlm=delimiter), r'multi{dlm}level{dlm}kda'.format(dlm=delimiter),