Exemplo n.º 1
0
        def acroize_heading(m):
            """Rewrite a matched heading so that it starts with the acronym.

            ``m`` is a match object: group 0 is the whole match, group 1 is
            copied unchanged in front of the result and group 2 is the heading
            text.  ``text`` is a mapping taken from the surrounding scope; its
            ``'acronym'`` entry supplies the acronym.

            Returns:
                * ``m[0]`` untouched when there is no acronym;
                * the bare acronym when the heading text is empty;
                * otherwise ``m[1]`` followed by
                  ``<span class="acro">ACRO</span>: heading text``.

            A leading number (or ``n–m`` range) is always stripped from the
            heading.  When the acronym ends in a range and the heading number
            differs from it, the acronym's range is replaced by the heading
            number.
            """
            acro = text.get('acronym')
            if not acro:
                return m[0]
            heading = m[2]
            if not heading:
                return acro

            h_text = heading
            m2 = regex.match(r'(\d+(?:–(\d+))?)(?:\.)?\s*(.*)$', heading)
            if m2:
                h_num = m2[1]
                h_text = m2[3]

                # The acronym may not end in a number at all; in that case
                # there is nothing to reconcile and it is used as it is
                # (previously this raised TypeError on ``m3[1]``).
                m3 = regex.match(r'(.*?)(\d+(?:–(\d+))?)$', acro)
                if m3:
                    acro_prefix, acro_num = m3[1], m3[2]
                    if acro_num != h_num and '–' in acro_num:
                        # Acronym covers a range: narrow it to this heading.
                        acro = acro_prefix + h_num

            new_heading = f'<span class="acro">{acro}</span>{": " if h_text else ""}{h_text}'
            return f'{m[1]}{new_heading}'
Exemplo n.º 2
0
def test_yaml(md_filepath):
    """Validate the YAML front matter of a markdown file.

    ``md_filepath`` must provide ``read()`` returning the file text.  The
    header is the text between the first pair of ``---`` markers; the test
    is skipped when there is none.

    Each entry of the module-level ``requirements`` mapping is a dict with a
    ``'required'`` flag and a ``'type'``:

    * required keys must be present;
    * ``'link'`` values, when present, must look like ``[text](target)``;
    * ``'date'`` values, when present, must be parseable and start with a
      ``20YY-MM-DD`` date.

    Finally every key found in the header must be a known requirement.
    """
    filestring = md_filepath.read()
    match = regex.search(r'^---(.*?)---', filestring, flags=regex.DOTALL)

    if not match:
        pytest.skip('No YAML header')

    # safe_load: the header is plain data, and yaml.load() without an explicit
    # Loader can construct arbitrary objects (and is rejected by PyYAML >= 6).
    # An empty header parses to None, which is treated as "no keys".
    parsed_yaml = yaml.safe_load(match.group(1)) or {}

    link_re = regex.compile(r'\[(.*)\]\((.*)\)')
    date_re = regex.compile(r'20[0-9]{2}-[0-9]{2}-[0-9]{2}')

    for requirement, req in requirements.items():
        if req['required']:
            assert requirement in parsed_yaml, 'YAML metadata missing required element: ' + requirement

        if requirement not in parsed_yaml:
            # Optional and absent: nothing to format-check (previously an
            # absent optional link raised KeyError instead of passing).
            continue
        value = str(parsed_yaml[requirement])

        if req['type'] == 'link':
            # Check external links have balanced brackets
            assert link_re.match(value), 'YAML metadata formatting error: ' + requirement

        if req['type'] == 'date':
            try:
                parse(value)
            except ValueError:
                raise AssertionError('YAML metadata formatting error: ' + requirement + ' date parse failed.')
            assert date_re.match(value), 'YAML metadata formatting error: ' + requirement + ' should use the format YYYY-MM-DD.'

    for header in parsed_yaml:
        assert header in requirements, 'YAML metadata header ' + header + ' is not a valid metadata type.'
Exemplo n.º 3
0
def get_next_document(h):
  """Read the next ``<doc ...> ... </doc>`` block from a byte-line stream.

  ``h`` must provide ``readline()`` returning UTF-8 encoded bytes.  Lines
  are decoded and stripped; empty lines are dropped.

  The opening ``<doc `` line is normalised before it is stored:
  ``_unk_`` becomes ``unknown``, a ``forum="1"``/``forum="0"`` attribute is
  appended depending on ``RE_FORUM``, ``urldomain``/``tld`` attributes are
  derived from the ``url`` attribute, and two known kinds of malformed
  attribute values are repaired.

  Returns the list of lines from ``<doc `` up to and including ``</doc>``,
  or ``None`` when the stream ends first (an unfinished document is
  discarded).  Lines that appear before any ``<doc `` line are skipped.
  """
  doc = None
  while True:
    l = h.readline()
    if not l:
      # EOF without a closing </doc>.
      return None
    l = l.decode('utf-8').strip()
    if not l:
      continue

    if re.match(u'^<doc ', l, re.UNICODE):

      # Fix _unk_.
      l = re.sub(r'_unk_', 'unknown', l)

      # Forum detection.  flags= must be passed by keyword: the fourth
      # positional parameter of re.sub() is ``count``, not ``flags``.
      forum = '1' if re.match(RE_FORUM, l, re.UNICODE) else '0'
      l = re.sub(u'>$', ' forum="%s">' % forum, l, flags=re.UNICODE)

      # Host and tld extraction.
      l = re.sub(r'( url="https{0,1}://)([^/]+)\.([a-z]{2,4})(|/|%)([^"]*")', r'\1\2.\3\4\5 urldomain="\2.\3" tld="\3"', l)

      # Fix some known problems in doc attr values.
      l = re.sub(r'=" +"', r'="unknown"', l)          # fix: attr=" "
      l = re.sub(r'="([^"]+)\\" ', r'="\1" ', l)   # fix: attr="val\"

      doc = [l]
    elif doc is not None:
      doc.append(l)
      if re.match(u'^</doc>', l, re.UNICODE):
        return doc
    # else: content outside any <doc> element; ignore it (this used to
    # fail with UnboundLocalError because ``doc`` did not exist yet).
Exemplo n.º 4
0
def process_lines(lines, NONBREAKING_PREFIX):
    """Group input lines into paragraphs and run ``do_it_for`` on each.

    Lines are stripped and joined with single spaces until a blank line or
    a line matching ``re_tag`` is reached.  At that point the accumulated
    text is handed to ``do_it_for`` together with the terminating line, and
    ``"<P>\\n"`` is appended when a blank line terminated non-empty text.
    Any text left at the end of input is processed with an empty markup
    argument.

    Returns the concatenated output as one string.
    """
    pieces = []
    text = ""
    for line in lines:
        line = line.strip()
        is_blank = regex.match(r'^\s*$', line) is not None

        if is_blank or re_tag.match(line) is not None:
            # time to process this block, we've hit a blank or <p>
            pieces.append(do_it_for(text, line, NONBREAKING_PREFIX))
            if is_blank and text:  # text followed by a blank line
                pieces.append("<P>\n")
            # Always start a fresh block.  Resetting only on blank lines
            # made text that ended at a markup line get emitted again with
            # the following block.
            text = ""
        else:
            # append the text, with a space
            text += line + " "

    # do the leftover text
    if text:
        pieces.append(do_it_for(text, "", NONBREAKING_PREFIX))
    return ''.join(pieces)
Exemplo n.º 5
0
Arquivo: server.py Projeto: ufal/ker
def process_file(file_path, tagger, idf_doc_count, idf_table, threshold, maximum_words):
    """
    Takes the uploaded file, detects its type (plain text, alto XML, zip)
    and calls a parsing function accordingly. If everything succeeds it
    returns keywords and 200 code, returns an error otherwise.

    The type is decided from the description string returned by
    ``magic.from_file``.  Returns a ``(payload, status)`` pair; on failure
    the payload is ``{"error": message}`` with status 400.
    """
    file_info = magic.from_file(file_path)

    # Plain prefix tests instead of regexes: in the old pattern
    # "^UTF-8 Unicode (with BOM) text" the parentheses formed a group, so it
    # could never match the literal "(with BOM)" and BOM files were decoded
    # as plain utf-8.
    if file_info.startswith("UTF-8 Unicode (with BOM) text"):
        lines = lines_from_txt_file(file_path, encoding='utf-8-sig')
    elif file_info.startswith(("UTF-8 Unicode", "ASCII text")):
        lines = lines_from_txt_file(file_path, encoding='utf-8')
    elif file_info.startswith('XML 1.0 document') and \
            file_path.endswith(('.alto', '.xml')):
        lines = lines_from_alto_file(file_path)
    elif file_info.startswith('Zip archive data'):
        lines = lines_from_zip_file(file_path)
    else:
        # Key was misspelled "eror", unlike the "Empty file" error below.
        return {"error": "Unsupported file type: {}".format(file_info)}, 400

    if not lines:
        return {"error": "Empty file"}, 400
    return keywords.get_keywords(lines, tagger, idf_doc_count, idf_table, threshold, maximum_words), 200
Exemplo n.º 6
0
def guess_split(majiribun, reading):
    """Map each kanji of ``majiribun`` to its share of ``reading``.

    A pattern is built from ``majiribun``: every character accepted by
    ``kanji_re`` becomes a hiragana capture group, every other character is
    matched literally.  The pattern is applied to ``reading`` twice, with
    greedy and with non-greedy groups.

    Returns a dict ``{kanji: reading part}`` when both attempts agree, and
    ``None`` when the reading does not fit the pattern or the split is
    ambiguous.
    """
    kanjis = []
    greedy_parts = []
    lazy_parts = []
    for char in majiribun:
        if kanji_re.match(char):
            kanjis.append(char)
            greedy_parts.append(r"(\p{Hiragana}+)")
            lazy_parts.append(r"(\p{Hiragana}+?)")
        else:
            literal = re.escape(char)
            greedy_parts.append(literal)
            lazy_parts.append(literal)

    greedy = re.match(''.join(greedy_parts) + '$', reading)
    if not greedy:
        return None

    yomis = greedy.groups()
    if yomis != re.match(''.join(lazy_parts) + '$', reading).groups():
        # Greedy and lazy splits differ: ambiguous.
        return None
    return dict(zip(kanjis, yomis))
    def testRegex(self):
        """Run ``match`` on "foobar" with literal, anchored and starred patterns."""
        cases = (
            (1, "foo"),     # literal, beginning of string
            (1, "oba"),     # literal, middle of string
            (0, "obo"),     # literal, not present
            (1, "^fo"),     # start anchor, at the start
            (0, "^bar"),    # start anchor, text only in the body
            (1, "bar$"),    # end anchor, at the end
            (0, "foo$"),    # end anchor, text only in the body
            (1, "fo*b"),    # star repeating a present character
            (1, "fooa*b"),  # star on an absent character, mid-pattern
            (1, "a*foo"),   # star on an absent character, leading
        )
        for expected, pattern in cases:
            self.assertEqual(expected, match(pattern, "foobar"))
Exemplo n.º 8
0
 def __init__(self, room, s, negative=True):
  """
  Parse expressions such as '/5m jid blabla@server', 'nick exp regexp', etc.
  (translated from Russian; the original adds "in short, in the style of
  glux" - presumably the name of another bot).

  fetch_time() first splits a time part off ``s``.  Text after '||' is
  the reason.  The remainder is 'jid VALUE', 'nick VALUE', or a bare nick
  that is looked up in ``room`` and resolved to its real JID.  VALUE may
  start with 'exp ' to mark it as a regular expression.
  """
  self.room = room
  self.negative = negative
  self.end_time, s = fetch_time(s)

  if '||' in s:
   s, _, reason = s.partition('||')
   s, self.reason = s.strip(), reason.strip()
  else:
   s, self.reason = s.strip(), ''

  lowered = s.lower()
  if lowered.startswith('jid '):
   self.by_jid = True
   s = s[4:].lower()
  elif lowered.startswith('nick '):
   self.by_jid = False
   s = s[5:]
  else:
   # Bare nick: resolve it through the room to a real JID and stop.
   self.by_jid = True
   self.regexp = False
   item = room.get(s, None)
   if not item:
    raise NickNotFound(s)
   if item.jid == item.realjid:
    raise NoJID(item.jid)
   self.value = item.realjid.lower()
   return

  if not s:
   raise ValueError

  self.regexp = s.lower().startswith('exp ')
  if self.regexp:
   s = s[4:]
   # Trial match, only to find out whether the expression is usable.
   try:
    regex.match(s, '*****@*****.**')
   except:
    raise MyRegexpError(s)
  self.value = s
Exemplo n.º 9
0
def parseaddr(address):
	"""Split a mail address field into a (name, address) pair.

	Three shapes are recognised, tried in this order:
	  1. 'text <addr>'  - the <...> part is the address, the rest the name;
	  2. 'addr (text)'  - the (...) part is the name, the rest the address;
	  3. anything else  - the whole string is the address, name is ''.
	Surrounding '<' '>' and '(' ')' are removed from the results.

	NOTE(review): the code uses regex.search() as returning an index
	(negative when nothing is found) and regex.match() as returning a
	match length.  That looks like the interface of the long-removed
	built-in `regex` module, not of `re` or the third-party `regex`
	package - confirm which module this file imports.
	"""
	# This is probably not perfect
	address = string.strip(address)
	# Case 1: part of the address is in <xx@xx> form.
	pos = regex.search('<.*>', address)
	if pos >= 0:
		# Everything before '<' is the name ...
		name = address[:pos]
		address = address[pos:]
		# ... as is anything that follows the matched <...> part.
		length = regex.match('<.*>', address)
		name = name + address[length:]
		address = address[:length]
	else:
		# Case 2: part of the address is in (comment) form
		# NOTE(review): presumably parentheses are literal in the
		# regex syntax in use here - verify.
		pos = regex.search('(.*)', address)
		if pos >= 0:
			# The comment is the name; text on either side of it
			# is joined to form the address.
			name = address[pos:]
			address = address[:pos]
			length = regex.match('(.*)', name)
			address = address + name[length:]
			name = name[:length]
		else:
			# Case 3: neither. Only an address
			name = ''
	name = string.strip(name)
	address = string.strip(address)
	# Drop the delimiters that identified each part.
	if address and address[0] == '<' and address[-1] == '>':
		address = address[1:-1]
	if name and name[0] == '(' and name[-1] == ')':
		name = name[1:-1]
	return name, address
Exemplo n.º 10
0
def faiordict2contigorder(file_name, file_format):
    '''Takes either a .fai or .dict file, and return a contig order dictionary, i.e., chrom_seq['chr1'] == 0

    file_name:   path of the index file to read.
    file_format: 'fai' (contig name is the first tab-separated field) or
                 'dict' (contig name is the SN: field of each @SQ line).

    Lines that carry no contig (other header lines of a .dict file, blank
    lines) are skipped.  Only the part of the name before the first space
    is kept.  If a contig name occurs more than once, its last position
    wins.
    '''

    assert file_format in ('fai', 'dict')

    if file_format == 'fai':
        contig_pattern = re.compile(r'([^\t]+)\t')
    else:
        contig_pattern = re.compile(r'@SQ\tSN:([^\t]+)\tLN:')

    contig_sequence = []
    with open(file_name) as gfile:
        for line_i in gfile:
            # Matching every line afresh matters for .dict files: the match
            # variable used to be assigned only on @SQ lines, so an @HD
            # first line raised UnboundLocalError and any later non-@SQ
            # line re-added the previous contig.
            contig_match = contig_pattern.match(line_i.rstrip('\n'))
            if contig_match:
                # some .fai files have space after the contig for descriptions.
                contig_sequence.append(contig_match.group(1).split(' ')[0])

    return {contig_i: n for n, contig_i in enumerate(contig_sequence)}
Exemplo n.º 11
0
 def __init__(self, text, lv=None, lc=None, vc=None):
     """Create an expression from text plus a language variety.

     text: the expression text, or another ``Ex`` whose ``text``, ``lc``
           and ``vc`` are copied (the remaining arguments are then
           ignored).
     lv:   language variety written as ``xxx-000`` (three lowercase
           letters, a dash, at least three digits); split into ``lc``
           and ``vc``.
     lc, vc: alternative to ``lv``: a 3-letter code (lowercased here) and
           a value convertible to a non-negative int.

     Raises TypeError if ``text`` is neither ``str`` nor ``Ex``, or if
     neither ``lv`` nor both ``lc`` and ``vc`` are given; ValueError if a
     given value is malformed.
     """
     if isinstance(text, Ex):
         self._text = text.text
         self._lc = text.lc
         self._vc = text.vc
         return
     if not isinstance(text, str):
         raise TypeError("text must be string")

     self._text = text
     if lv:
         # \Z rather than $: "$" also matches just before a trailing
         # newline, which would let "abc-123\n" through.
         if not re.match(r'^[a-z]{3}-[0-9]{3,}\Z', lv):
             raise ValueError("lv must be in the format xxx-000")
         self._lc = lv[:3]
         self._vc = int(lv[4:])
     elif lc and vc is not None:
         lc = lc.lower()
         if not re.match(r'^[a-z]{3}\Z', lc):
             raise ValueError("lc must be a 3-letter ISO 639 code")
         self._lc = lc
         try:
             vc = int(vc)
         except ValueError:
             raise ValueError("vc must be a positive integer")
         if vc < 0:
             raise ValueError("vc must be a positive integer")
         self._vc = vc
     else:
         raise TypeError("{cls} requires lv".format(cls=self.__class__.__name__))
Exemplo n.º 12
0
def create_activation(data, labels, standard_cols, group_labels=None):
    """Build a ``database.Activation`` from one table row.

    data:          the cell values of the row.
    labels:        column label for each cell; every cell is recorded under
                   its label with ``add_col``.
    standard_cols: for each cell, the name of the standard attribute it
                   maps to, or ``None``.
    group_labels:  stored as ``activation.groups``; a new empty list when
                   omitted (the former ``[]`` default was one list shared
                   by every activation created without groups).

    Validation generally does not prevent a bad value from reaching the
    activation; it is flagged in ``activation.problems``.  The exception is
    an invalid x/y/z value: the activation is returned immediately, without
    the remaining columns and without ``groups`` being set.
    """
    activation = database.Activation()

    # Three numbers separated by commas, semicolons or whitespace; each may
    # carry a sign, a dot or spaces in front of it.
    cs = r'([\-\.\s]*\d{1,3})'
    multi_coord = r'%s[,;\s]+%s[,;\s]+%s' % (cs, cs, cs)

    for i, col in enumerate(data):

        # Set standard attributes if applicable and do validation where appropriate.
        sc = standard_cols[i]
        if sc is not None:

            # Validate XYZ columns: Should only be integers (and possible trailing decimals).
            # The exception is that ScienceDirect journals often follow the
            # minus sign with a space (e.g., - 35), which we strip.
            if regex.match(r'[xyz]$', sc):
                m = regex.match(r'(-)\s+(\d+\.*\d*)$', col)
                if m:
                    col = "%s%s" % (m.group(1), m.group(2))
                if not regex.match(r'(-*\d+)\.*\d*$', col):
                    logging.debug("Value %s in %s column is not valid" % (col, sc))
                    activation.problems.append("Value in %s column is not valid" % sc)
                    return activation
                col = float(col)

            elif sc == 'region':
                if not regex.search(r'[a-zA-Z]', col):
                    logging.debug("Value in region column is not a string")
                    activation.problems.append("Value in region column is not a string")

            setattr(activation, sc, col)

        # Always include all columns in record
        activation.add_col(labels[i], col)

        # Handle columns with multiple coordinates (e.g., 45;12;-12).
        # Assume that any series of 3 numbers reflects coordinates. Will
        # fail if there are leading numbers!!!
        # NOTE(review): this tests whether the *index* i occurs among the
        # values of standard_cols; the surrounding comments suggest that
        # "standard_cols[i] is None" was intended.  Behavior kept as is.
        if i not in standard_cols:
            m = regex.search(multi_coord, unicode(col).strip())
            if m:
                # Remove the gap some ScienceDirect journals leave between
                # the minus sign and the number.
                x, y, z = [regex.sub(r'-\s+', '-', c) for c in [m.group(1), m.group(2), m.group(3)]]
                logger.info("Found multi-coordinate column: %s\n...and extracted: %s, %s, %s" % (col, x, y, z))
                activation.set_coords(x, y, z)

    activation.groups = [] if group_labels is None else group_labels
    return activation
Exemplo n.º 13
0
def parse_line(line, perv_url):
    """Parse one ``name_1-name_2-tags[,url]`` line.

    line:     the raw line; surrounding whitespace is ignored.
    perv_url: URL to use when the line carries none.  When it is set, the
              line must not contain a URL of its own.

    Each name must consist of exactly two space-separated words made of
    the characters in ``_cryllic``; the two names must differ.  ``tags`` is
    a dot-separated list of distinct, non-empty tags.  The URL must match
    ``http://host/word``.

    Returns ``(name_1, name_2, link_types, url)``.

    Raises ValueError with one of the codes STR_EMPTY, STR_FORMAT,
    STR_ENTRY_EMPTY, STR_NAME_FORMAT, STR_SAME_NAMES, STR_TAG_FORMAT,
    ``STR_TAG_DOUBLED:<tags>`` or STR_LINK_FORMAT.
    """
    if not line or not line.strip():
        raise ValueError("STR_EMPTY")

    spt = line.strip().split('-')
    if len(spt) != 3:
        raise ValueError("STR_FORMAT")
    name_1, name_2, attrs = spt
    attrs_spt = attrs.split(',')

    if not (len(attrs_spt) == 2 or (len(attrs_spt) == 1 and perv_url)):
        raise ValueError("STR_ENTRY_EMPTY")

    name_pattern = r"^[" + _cryllic + r"\s]+$"
    for name in (name_1, name_2):
        if not name \
                or not regex.match(name_pattern, name) \
                or len(name.split(' ')) != 2:
            raise ValueError("STR_NAME_FORMAT")

    if name_1 == name_2:
        raise ValueError("STR_SAME_NAMES")

    if len(attrs_spt) == 2 and perv_url:
        raise ValueError("STR_TAG_FORMAT")
    if not regex.match(r"^(?!\.)[" + _cryllic + r"\.]+(?<!\.)$", attrs_spt[0]):
        raise ValueError("STR_TAG_FORMAT")

    link_types = attrs_spt[0].split('.')

    # Two adjacent dots give an empty tag.  all() instead of testing the
    # truthiness of filter(...): on Python 3 filter() returns an iterator,
    # which is always true, so every line was rejected.
    if not all(link_types):
        raise ValueError("STR_TAG_FORMAT")

    arr = collections.Counter(link_types)
    doubled_tags = set(i for i in arr if arr[i] > 1)
    if doubled_tags:
        raise ValueError("STR_TAG_DOUBLED:" + ",".join(doubled_tags))

    url = attrs_spt[1] if len(attrs_spt) == 2 else perv_url

    if not regex.match(r"http://[\w\.]+/[\w]+$", url):
        raise ValueError("STR_LINK_FORMAT")

    # Disabled checks, kept for reference:
    # sim_names = list(es.get_similar_names([name_1, name_2]))
    # if isinstance(sim_names[0], basestring):
    #     raise ValueError(u"STR_SIMILAR_NAME:{},{}".format(name_1,sim_names[0]))
    # if isinstance(sim_names[1], basestring):
    #     raise ValueError(u"STR_SIMILAR_NAME:{},{}".format(name_2,sim_names[1]))
    #
    # tags = filter(lambda x: not x[1], zip(link_types, es.check_tags(link_types)))
    # if len(tags) != 0:
    #     raise ValueError(u"STR_TAG_NOT_FOUND:{}".format(",".join(map(lambda x: x[0], tags))))

    return (name_1, name_2, link_types, url)
Exemplo n.º 14
0
 def test_zero_or_one(self):
     """``ba?`` must accept "ba" and "b" and reject "aa"."""
     pattern = regex.build_regex("ba?")
     for accepted in ("ba", "b"):
         self.assertTrue(regex.match(pattern, accepted))
     self.assertFalse(regex.match(pattern, "aa"))
Exemplo n.º 15
0
def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
    """Run regexes against message's marked lines to strip quotations.

    Returns ``lines`` with the last quoted segment removed, or ``lines``
    unchanged when the message is a forward, an inline reply, or contains
    no recognisable quotation.

    ``markers`` holds one character per line.  From the code and comments
    below: 's' is a splitter, 'm' a quotation marker, 'f' a forward;
    't' and 'e' presumably stand for text and empty lines - TODO confirm.
    NOTE(review): ``markers`` is used with both ``.replace()`` and an
    in-place ``.reverse()``, so it cannot be a plain ``str``; confirm the
    type callers pass.

    Also returns return_flags.
    return_flags = [were_lines_deleted, first_deleted_line,
                    last_deleted_line]
    The list is overwritten in place on every call, so a caller reads the
    result from the list it passed in.

    NOTE(review): the usage example that used to be here was garbled (it
    named a different function and its e-mail address had been stripped),
    so it has been removed.
    """
    # Pre-process marker sequence

    # if there are no splitter there should be no markers. However, allow markers if more than 3!
    if 's' not in markers and not re.search('(me*){3}', markers):
        markers = markers.replace('m', 't')

    # Look for forwards (don't remove anything on a forward)

    # if there is an f before the first split, then it's a forward.
    if re.match('[te]*f', markers):
        return_flags[:] = [False, -1, -1]
        return lines

    # Remove last quoted segment

    # match from the end of the markers list
    # (reversed in place here and restored further down, so the patterns
    # below describe the message read from its last line backwards)
    markers.reverse()

    # match for unmarked quote following split
    quotation = re.match(r'e*(te*)+(se*)+', markers)
    if not quotation:

        # match for inline replies
        # (re_orig is defined elsewhere in this file)
        if re_orig.match(r'e*[mfts]*((te*)+(me*)+)+[mfts]*((se*)+|(me*){2,})', markers):
            return_flags[:] = [False, -1, -1]
            return lines 

        # match for normal reply with quote
        quotation = re_orig.match(r'e*(me*)+[mefts]*((se*)+|(me*){2,})', markers)

    if not quotation:
        # match for normal reply with quote and signature below quote
        if re.match(r'e*(te*)+(me*)+.*(s)+e*(te*)+', markers):
            quotation = re.match(r'e*(te*)+(me*)+.*(s)+', markers)

    markers.reverse()

    # If quotation, return it
    if quotation:
        # The match offsets refer to the reversed sequence; convert them
        # back to positions counted from the start of the message.
        start = len(markers) - quotation.end() + 1
        end = len(markers) - quotation.start() - 1
        return_flags[:] = True, start, end
        return lines[:start] + lines[end:]


    return_flags[:] = [False, -1, -1]
    return lines
Exemplo n.º 16
0
def printDiseaseClass(outStream,line):
    """Write the statements for one disease record to ``outStream``.

    ``line`` is read by position: ``line[0]`` is the identifier (it forms
    the subject together with ``Namespace``), ``line[1]`` the label,
    ``line[2]`` a comma-separated list of alternative names and
    ``line[4]`` the external references, separated by ``),``.

    Each external reference is tried against three patterns, from the most
    to the least specific; the first one that matches supplies the database
    (group 1) and the identifier (group 2).  References matching none of
    them are reported as ``#ERROR`` lines in the output.  An exception
    while handling one reference is printed and the next one is processed.
    """
    outStream.write('<%s/%s> \n\ta  %s;\n' % (Namespace, line[0], visumpointGene.VP_PharmGKBDisease))
    outStream.write('\trdfs:label "%s"^^xsd:string ;\n' % (visumpointGene.strip(line[1])))
    outStream.write('\tskos:prefLabel "%s"^^xsd:string ;\n' % (visumpointGene.strip(line[1])))

    # A double-quoted name may itself contain a comma, in which case the
    # split cuts it in two: the piece that opens the quote is parked in
    # ``trip`` and glued to the piece that closes it.
    trip = ""
    useName = ""
    if len(line[2]) > 0 :
        for name in line[2].split(','):
            if name.startswith("\"") and name.endswith("\""):
                useName = name
            elif name.startswith("\""):
                trip = name
            elif name.endswith("\""):
                useName = (trip + "," +name).strip("\"")
                trip = ""
            else :
                useName = name
            if len(useName) > 0:
                visumpointGene.printAlternativeName(outStream, "", visumpointGene.strip(useName), visumpointGene.VP_PharmGKB,"\t\t")
                useName = ""

    # Tried in this order; all of them capture (database, id, ...).
    xref_patterns = (
        r"(.+):(.+)\((.+)\[(.+)/(.+)\]",       # db:id(text [a/b]
        r'(.+):(.+)\(([^()]++)\((.*)\)+',      # db:id(text (qualifier)
        r"(.+):(.+)\((.+)",                    # db:id(text
    )
    # Flags of the regex module throughout; two of the three calls used to
    # pass re.M|re.I to regex.match.
    flags = regex.M | regex.I

    if len(line[4]) > 0 :
        for name in line[4].split('),'):
            try:
                for pattern in xref_patterns:
                    match = regex.match(pattern, name, flags)
                    if match:
                        visumpointGene.printCrossReference(outStream, dbReference(match.group(1)), visumpointGene.strip(match.group(2)), visumpointGene.VP_PharmGKB,"\t\t")
                        break
                else:
                    outStream.write("#ERROR : processing External References : %s\n" % name)
                    outStream.write("#\t\t %s\n" % line[4])
            except Exception:
                # Best effort: report the exception type and carry on.
                e = sys.exc_info()[0]
                print( "Error: %s" % e )

    outStream.write('\t. # %s %s\n\n' % (line[1], line[0]))
Exemplo n.º 17
0
 def replacement(m):
     """Turn the reference in group 2 of ``m`` into a link to ``baseurl``.

     Group 1 is the text in front of the reference and is always kept.
     When it looks like a page reference (contains ``p.``) or contains an
     ``A`` after at least one character, both groups are returned
     unchanged.
     """
     prefix, ref = m[1], m[2]

     # Ignore 'also cf. ix p. 393ff' and ' A'
     is_excluded = regex.match(r".+p\.\s*", prefix) or regex.match(r".+A", prefix)
     if is_excluded:
         return "{}{}".format(prefix, ref)

     # The link target uses a plain hyphen where the text has a minus sign.
     path = ref.replace("−", "-")
     return '{}<a href="{}{}" target="_blank">{}</a>'.format(prefix, baseurl, path, ref)
Exemplo n.º 18
0
 def test_match_many(self):
     """``ab[cde]fg`` needs exactly one of c, d, e between "ab" and "fg"."""
     pattern = regex.build_regex("ab[cde]fg")
     for accepted in ("abcfg", "abdfg", "abefg"):
         self.assertTrue(regex.match(pattern, accepted))
     self.assertFalse(regex.match(pattern, "abfg"))
def hey(what):
    """Return the reply to the remark ``what``.

    * nothing but whitespace         -> 'Fine. Be that way!'
    * only capitals (digits allowed) -> 'Whoa, chill out!'
    * ends with a question mark      -> 'Sure.'
    * anything else                  -> 'Whatever.'

    Whitespace and punctuation are ignored when testing for capitals, and
    a remark consisting only of digits is never treated as shouting.
    """
    what = what.strip()
    plain = regex.sub(u'[\s{}]+'.format(regex.escape(punctuation)), '', what)

    digits_only = regex.match(u'^[\d\s]+$', plain)
    if not digits_only and regex.match(u'^\s*$', what):
        return 'Fine. Be that way!'
    if not digits_only and regex.match(u'^[\d\s\p{Lu}]+$', plain):
        return 'Whoa, chill out!'
    return 'Sure.' if what.endswith('?') else 'Whatever.'
Exemplo n.º 20
0
def catchup(coordinate_i, line_j, filehandle_j, chrom_sequence):
    '''
    Advance the j_th vcf file until it reaches or passes the i_th coordinate.

    The coordinate of a line is whatever pattern_chr_position matches at its
    start ('' when it does not match).  whoisbehind() compares it with
    coordinate_i: 1 means file_j is behind and another line is read,
    10 means the coordinates are equal, 0 means file_j is already ahead.

    Returns (True, line_j)  if file_j has an entry at the i_th coordinate.
    Returns (False, line_j) if file_j has run past the i_th coordinate.
    line_j is the line at which reading stopped.
    '''

    def coordinate_of(vcf_line):
        found = re.match( pattern_chr_position, vcf_line )
        return found.group() if found else ''

    is_behind = whoisbehind( coordinate_i, coordinate_of(line_j), chrom_sequence )

    # Keep reading for as long as file_j is behind:
    while is_behind == 1:
        line_j = filehandle_j.readline().rstrip()
        is_behind = whoisbehind( coordinate_i, coordinate_of(line_j), chrom_sequence )

    if is_behind == 10:
        # Same position.
        reporter = (True, line_j)
    elif is_behind == 0:
        # file_j is ahead.
        reporter = (False, line_j)

    return reporter
Exemplo n.º 21
0
def readDrugLabelsFile(infilename):
    """Read the tab-separated drug-label file into a nested dict.

    Rows with an index greater than ``START`` are processed; reading stops
    after the first row whose index exceeds ``END``.  Column 1 of a row is
    expected to read ``FDA Label for DRUG [and GENES]`` or
    ``European Medicines Agency (EMA) Label for DRUG [and GENES]``.

    Returns ``{drug: {gene: {gene, col2, col3, col4}}}``; ``NO_GENE`` is
    used as the gene key when the label names no gene.  Rows that do not
    match are reported on stdout and skipped.
    """
    with_genes = (
        r'FDA Label for ([a-zA-Z0-9 \-,]+) and (.+)',
        r'European Medicines Agency \(EMA\) Label for ([a-zA-Z0-9 \-,]+) and (.+)',
    )
    drug_only = (
        r'FDA Label for ([a-zA-Z0-9 \-,]+)',
        r'European Medicines Agency \(EMA\) Label for ([a-zA-Z0-9 \-,]+)',
    )

    def first_match(patterns, label):
        # First successful match among ``patterns``, or None.
        for pattern in patterns:
            found = regex.match(pattern, label, regex.M|regex.I)
            if found:
                return found
        return None

    table = {}
    # The with-statement closes the file.  The former ``finally: f.close()``
    # raised NameError when open() itself failed, hiding the real error.
    with open(infilename, 'r') as f:
        lineRaw = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for x, line in enumerate(lineRaw):
            if x > START:
                print("Drug Labels : %s - %s" % (x,line[1]))
                try:
                    drug = ""
                    genes = ""
                    match = first_match(with_genes, line[1])
                    if match:
                        drug = match.group(1)
                        genes = match.group(2)
                    else:
                        match = first_match(drug_only, line[1])
                        if match:
                            drug = match.group(1)
                        else:
                            # This message had two placeholders but a single
                            # argument, so it never printed.
                            print("ERROR ON ROW - readDrugLabelsFile : %s - %s" % (x, line[1]))

                    if len(drug) > 0:
                        row = table.setdefault(drug, {})
                        # NOTE(review): these are set literals, so column
                        # order and duplicate values are lost; kept because
                        # callers may depend on the current shape.
                        if len(genes) > 0:
                            for gene in genes.split(","):
                                row[gene] = {gene, line[2], line[3],line[4]}
                        else:
                            row[NO_GENE] = {NO_GENE, line[2], line[3],line[4]}
                except Exception:
                    print("ERROR ON ROW - readDrugLabelsFile 2: %s - %s" % (x,line[0]))

            if x > END:
                return table
    return table
Exemplo n.º 22
0
def untag(string, default_lv='und-000'):
    """Parse a tagged string into an ``Ap`` containing ``Mn`` objects.

    The string is a sequence of ``⫷obj⫸text`` or ``⫷obj:lv⫸text`` items,
    where ``lv`` has the form ``xxx-000``; items without an ``lv`` use
    ``default_lv``.  ``⁋`` is shorthand for ``⫷mn⫸`` and ``‣`` for
    ``⫷ex⫸``.  Tags handled in the loop below: ``mn``, ``ex``, ``df`` and
    the ``dcs``/``mcs`` and ``dpp``/``mpp`` families (``d`` items are
    attached to the current ``Dn``, ``m`` items to the current ``Mn``).

    Returns the populated ``Ap``; an empty one for empty input.
    """
    ap = Ap()
    # Expand the shorthand markers, unless the full tag already follows.
    string = string.replace('⁋⫷mn', '⫷mn')
    string = string.replace('⁋', '⫷mn⫸')
    string = string.replace('‣⫷ex', '⫷ex')
    string = string.replace('‣', '⫷ex⫸')
    mn = None
    dn = None
    superclass_ex = None    # pending superclass, set by a tag ending in '2'
    attribute = None        # pending attribute, set by the first pp tag of a pair
    # Each captured piece is a tag followed by at least one character of
    # text; empty strings produced by the split are dropped.
    tag_split = [s for s in re.split(r'(⫷.*?⫸[^⫷]+)', string) if s]
    if not tag_split: return ap
    # Make sure the sequence opens with a meaning ...
    if tag_split[0] != '⫷mn⫸':
        tag_split.insert(0, '⫷mn⫸')
    # ... and that untagged leading text is treated as an expression.
    # NOTE(review): tag_split[1] raises IndexError when the input consists
    # of '⫷mn⫸' alone - confirm whether that input can occur.
    if not tag_split[1].startswith('⫷'):
        tag_split[1] = '⫷ex⫸' + tag_split[1]
    for s in tag_split:
        # NOTE(review): re.search() returns None for a piece that contains
        # no ⫷...⫸ tag, which would fail on .groups() - verify the inputs.
        obj, lv, text = re.search(r'⫷(.+?)(?::([a-z]{3}-\d{3}))?⫸(.*)', s).groups()
        if not lv: lv = default_lv
        if obj == 'mn':
            # A new meaning starts: store the one built so far.
            if mn: ap.append(mn)
            mn = Mn()
        if obj == 'ex':
            # A new expression starts: attach the previous denotation.
            if dn: mn.dn_list.append(dn)
            dn = Dn(Ex(text, lv))
        if obj == 'df':
            mn.df_list.append(Df(text, lv))
        if re.match(r'[dm]cs[12]?', obj):
            if obj[-1] == '2':
                # Remember the superclass for the class item that follows.
                superclass_ex = Ex(text, lv)
            else:
                if obj.startswith('m'):
                    mn.cs_list.append(Cs(Ex(text, lv), superclass_ex))
                if obj.startswith('d'):
                    dn.cs_list.append(Cs(Ex(text, lv), superclass_ex))
                superclass_ex = None
        if re.match(r'[dm]pp', obj):
            # pp items come in pairs: the first is the attribute, the
            # second the property text.
            if attribute is None:
                attribute = Ex(text, lv)
            else:
                if obj.startswith('m'):
                    mn.pp_list.append(Pp(text, attribute))
                if obj.startswith('d'):
                    dn.pp_list.append(Pp(text, attribute))
                attribute = None
    # Flush whatever is still open.
    # NOTE(review): dn is never reset when a new meaning starts, so the
    # last denotation of one meaning appears to be attached to the next
    # one - confirm whether that is intended.
    if dn: mn.dn_list.append(dn)
    if mn: ap.append(mn)
    return ap
Exemplo n.º 23
0
    def clean_keyword(self):
        """Validate the ``keyword`` field as a USSD code.

        The stripped value must be non-empty and consist only of digits,
        ``*`` and ``#``; otherwise ``forms.ValidationError`` is raised.
        Returns the stripped keyword.
        """
        keyword = self.cleaned_data.get("keyword", "").strip()

        is_valid = bool(keyword) and regex.match(r"^[\d\*\#]+$", keyword, flags=regex.UNICODE)
        if not is_valid:
            raise forms.ValidationError(_("USSD code must contain only *,# and numbers"))

        return keyword
Exemplo n.º 24
0
	def readheaders(self):
		"""Read header lines from self.fp into self.headers.

		Reading stops at end of file, at a line for which
		self.islast() is true, or at the first line that is neither a
		header nor a continuation line.  In the last case the method
		tries to seek back so that the line can be read again.

		self.status is '' on success and otherwise one of
		'EOF in headers', 'No headers' or 'Bad header', with
		'; bad seek' appended when seeking back failed.

		NOTE(review): regex.match() is compared with ">= 0", i.e. it is
		expected to return a number.  That looks like the interface of
		the long-removed built-in `regex` module - confirm which module
		this file imports.
		"""
		# 'list' is a local alias of self.headers (it hides the builtin
		# of the same name inside this method).
		self.headers = list = []
		self.status = ''
		headerseen = 0
		while 1:
			line = self.fp.readline()
			if not line:
				self.status = 'EOF in headers'
				break
			if self.islast(line):
				break
			elif headerseen and line[0] in ' \t':
				# It's a continuation line.
				list.append(line)
			elif regex.match('^[!-9;-~]+:', line) >= 0:
				# It's a header line.
				list.append(line)
				headerseen = 1
			else:
				# It's not a header line; stop here.
				if not headerseen:
					self.status = 'No headers'
				else:
					self.status = 'Bad header'
				# Try to undo the read.
				try:
					# whence=1: relative to the current position
					self.fp.seek(-len(line), 1)
				except IOError:
					self.status = \
						self.status + '; bad seek'
				break
Exemplo n.º 25
0
def split_example_file(example, dst_dir):
    """Split an example script into its ``goog.require`` lines and its code.

    Two files are written to ``dst_dir``:

    * ``<name>`` - the code: every line after the first ``goog.require``
      line that is not itself a ``goog.require`` line, with the blank
      lines in front of the first code line dropped;
    * ``<name>`` with ``.js`` replaced by ``-require.js`` - the
      ``goog.require`` lines.

    Everything before the first ``goog.require`` line is discarded.
    """
    # The dot is escaped so that only a literal "goog.require" qualifies.
    require_re = re.compile(r'goog\.require\(\'(.*)\'\);')

    target_lines = []
    target_require_lines = []

    found_requires = False
    found_code = False
    # All three files are handled by with-statements so that they are
    # closed even if reading or writing fails.
    with open(example) as source:
        for line in source:
            if require_re.match(line):
                found_requires = True
                target_require_lines.append(line)
            elif found_requires and (found_code or line not in ('\n', '\r\n')):
                found_code = True
                target_lines.append(line)

    base_name = os.path.basename(example)
    with open(os.path.join(dst_dir, base_name), 'w') as target:
        target.writelines(target_lines)

    require_name = base_name.replace('.js', '-require.js')
    with open(os.path.join(dst_dir, require_name), 'w') as target_require:
        target_require.writelines(target_require_lines)
Exemplo n.º 26
0
    def process(self, srcfo):
        """Convert ``U+CODE FIELD CONTENT`` lines into the fallback-data text.

        ``srcfo`` is iterated line by line.  Comment lines (starting with
        ``#``) and whitespace-only lines are skipped; lines that do not
        have the expected shape are printed with their line number and
        skipped.  Consecutive lines with the same code are collected into
        one dict (``'code'`` plus one entry per field) and handed to
        ``self.writeout`` once the code changes or the input ends.

        Returns the generated text, which starts with
        ``sc.lzh2enFallbackData = {`` and ends with ``}``.
        """
        current = None

        # The temporary file only buffers the output; the with-statement
        # makes sure it is closed again.
        with tempfile.TemporaryFile('w+') as outfo:
            outfo.write('sc.lzh2enFallbackData = {')

            for lineno, line in enumerate(srcfo):
                if line.startswith('#') or line.isspace():
                    continue
                m = regex.match(r'U\+(?<code>\w+)\s+(?<field>\w+)\s+(?<content>.*)', line)
                if not m:
                    print('{}: {}'.format(lineno + 1,line))
                    # Without this the next statement failed on m being None.
                    continue

                if current is None:
                    current = {'code': m['code']}
                elif current['code'] != m['code']:
                    self.writeout(current, outfo)
                    current = {'code': m['code']}

                current[m['field']] = m['content']

            # The record being collected when the input ends has not been
            # written yet; it used to be dropped.
            if current is not None:
                self.writeout(current, outfo)

            outfo.write('\n}\n')
            outfo.flush()
            outfo.seek(0)
            return outfo.read()
Exemplo n.º 27
0
def test():
    """Try the nested-parenthesis pattern on a sample identifier string.

    Prints the whole match followed by every capture group, or
    ``No match!!`` when the pattern does not match.
    """
    # Sample inputs and candidate patterns from earlier experiments; only
    # ``db_id_PP`` and ``str4`` are exercised below.  Patterns are raw
    # strings so that ``\(`` is not treated as a string escape.
    s = 'aaa(((1+0)+1)+1)bbb'
    db_id_type_str = r'(.+):(.+)\((.+)'

    str1 = "NDFRT:N0000002071(Mycobacterium Infections [Disease/Finding])"
    db_id_type_strB = r"(.+):(.+)\((.+)\[(.+)/(.+)\]"

    str2 = "SnoMedCT:109978004(T-cell lymphoma (clinical)"
    db_id_PP = r'(.+):(.+)\(([^()]++)\((.*)\)+'

    str3 = "MeSH:D015430(Weight Gain"
    strExp = r"(.+):(.+)\((.+)"

    str4 = "MeSH:D015430(Weight Gain"
    match = regex.match(db_id_PP, str4, regex.M | regex.I)

    if match:
        # Single-argument print() produces the same text on Python 2 and 3.
        print("matchObj.group() :  %s" % match.group())
        # Iterate over the groups the pattern really has instead of asking
        # for a hard-coded group(5), which does not exist in db_id_PP.
        for index, value in enumerate(match.groups(), start=1):
            print("matchObj.group(%d) :  %s" % (index, value))
    else:
        print("No match!!")
Exemplo n.º 28
0
def testSlurpy(testCase):
    """Return True when ``testCase`` is a ``reSlimp`` match followed by a ``reSlump``.

    The string must start with text matching ``reSlimp``; whatever follows
    that prefix must match ``reSlump`` in its entirety.

    Args:
        testCase: The candidate string.

    Returns:
        bool: True if both parts match, False otherwise.
    """
    prefix = re.match(reSlimp, testCase)
    if prefix is None:
        return False
    # re.match anchors at position 0, so end() is the length of the prefix.
    remainder = testCase[prefix.end():]
    return re.fullmatch(reSlump, remainder) is not None
Exemplo n.º 29
0
def read_file(f):
    """
        Reads a ply file and outputs a list of points.

        The first line of ``f`` is skipped and the second one is inspected:
        if it contains the word "binary", the file is converted with
        ``convert_to_ascii`` and the converted copy is parsed instead (and
        deleted afterwards).  Otherwise every remaining line made of exactly
        three whitespace-separated tokens that all parse as floats becomes a
        ``Point``; all other lines are ignored.

        Args:
            f: An open, readable text file object positioned at its start.

        Returns:
            list: The ``Point`` objects found, in file order.
    """
    import regex

    f.readline()  # first line is not needed

    if "binary" in f.readline():
        import os

        name = convert_to_ascii(f)
        try:
            with open(name) as ascii_f:
                return read_file(ascii_f)
        finally:
            # Remove the converted copy without going through a shell, and
            # do so even when parsing fails.
            if os.path.exists(name):
                os.remove(name)

    number = r"[\d\w-.]+"
    # Compile once instead of rebuilding the pattern for every line.
    point_re = regex.compile(
        r"^\s*(" + number + r")\s+(" + number + r")\s+(" + number + r")\s*$")

    points = []
    line = f.readline()
    while line:
        match = point_re.match(line)
        if match is not None:
            try:
                points.append(Point(float(match.group(1)),
                                    float(match.group(2)),
                                    float(match.group(3))))
            except ValueError:
                # Three tokens that are not numbers (e.g. header lines).
                pass
        line = f.readline()
    return points
Exemplo n.º 30
0
def dispatch_shorthand_command(msg):
    """Run a sequence of shorthand commands against the latest room messages.

    The text after the first three characters of ``msg.content`` is
    unescaped and split on whitespace.  Each token may start with a repeat
    count (``3k`` means ``k`` three times).  The expanded commands are
    paired, in order, with the messages returned by ``get_last_messages``;
    a ``-`` command skips its message, anything else is forwarded to
    ``dispatch_reply_command``.

    Returns:
        str: One report line per processed message joined by newlines, or
        an empty string when no command produced a truthy result.
    """
    expanded = []
    for token in GlobalVars.parser.unescape(msg.content[3:]).split():
        repeat, name = regex.match(r"^(\d*)(.*)", token).groups()
        expanded.extend([name] * (int(repeat) if repeat else 1))

    targets = get_last_messages(msg.room, len(expanded))

    report = []
    produced_result = False
    for name, target in zip(expanded, targets):
        if name == "-":
            report.append("[:{}] <skipped>".format(target.id))
            continue

        reply = dispatch_reply_command(target, msg, name)
        if reply:
            produced_result = True
            report.append("[:{}] {}".format(target.id, reply))
        else:
            report.append("[:{}] <processed without return value>".format(target.id))

    if not produced_result:
        return ""
    return "\n".join(report)
Exemplo n.º 31
0
def get_sheets_for_ref(tref, pad=True, context=1):
    """
    Collect the public sheets whose sources cite ``tref``.

    The ref is optionally padded and widened by ``context`` before being
    turned into regular expressions; those are used both in the database
    query and to pick the matching source refs of every sheet found.
    Sheets are visited in descending order of views.

    Returns a list with one dict per (sheet, matching source ref) pair,
    formatted for the client sidebar.
    """
    oref = model.Ref(tref)
    if pad:
        oref = oref.padded_ref()
    if context:
        oref = oref.context_ref(context)

    ref_re = oref.regex()

    query = {
        "$or": [{"sources.ref": {"$regex": r}} for r in oref.regex(as_list=True)],
        "status": "public"
    }
    projection = {"id": 1, "title": 1, "owner": 1, "sources.ref": 1, "views": 1}
    cursor = db.sheets.find(query, projection).sort([["views", -1]])

    results = []
    for sheet in cursor:
        cited_refs = [
            source["ref"]
            for source in sheet.get("sources", [])
            if "ref" in source and regex.match(ref_re, source["ref"])
        ]
        for cited in cited_refs:
            try:
                anchor = model.Ref(cited)
            except InputError:
                # Source refs that cannot be parsed are ignored.
                continue

            ownerData = public_user_data(sheet["owner"])
            results.append({
                "category": "Sheets",
                "type": "sheet",
                "owner": sheet["owner"],
                "_id": str(sheet["_id"]),
                "anchorRef": anchor.normal(),
                "anchorVerse": anchor.sections[-1] if len(anchor.sections) else 1,
                "public": True,
                "commentator": user_link(sheet["owner"]),  # legacy, used in S1
                "text": "<a class='sheetLink' href='/sheets/%d'>%s</a>" % (
                    sheet["id"], strip_tags(sheet["title"])),  # legacy, used in S1
                "title": strip_tags(sheet["title"]),
                "sheetUrl": "/sheets/" + str(sheet["id"]),
                "ownerName": ownerData["name"],
                "ownerProfileUrl": ownerData["profileUrl"],
                "ownerImageUrl": ownerData["imageUrl"],
                "views": sheet["views"]
            })

    return results
Exemplo n.º 32
0
        isIndication = 0
        isDosage = 0
        nodestart = 0
        nodeend = 0
        indTl = None
        dosTl = None
        for child in ET.fromstring(section).iter():
            if child.tag == "code":
                if "code" in child.attrib:
                    if child.attrib['code'] == "34068-7":
                        isDosage = 1
                    elif child.attrib['code'] == "34067-9":
                        isIndication = 1

            if isIndication == 0 and child.tag == "title":
                if re.match(r".*\b(?:INDICATION|INDICATIONS)\b.*",str(ET.tostring(child).decode("utf-8")),re.IGNORECASE | re.DOTALL)\
                and len(indicText)==0:
                    isIndication = 1

            if isDosage == 0 and child.tag == "title":
                if re.match(r".*\bDOSAGE\b.*\bADMINISTRATION\b.*",str(ET.tostring(child).decode("utf-8")),re.IGNORECASE | re.DOTALL)\
                and len(dosText)==0:
                    isDosage = 1

            if isIndication == 1 and child.tag != "code":
                if ET.fromstring(section).find("text") != None:
                    for text in ET.fromstring(section).findall("./text"):
                        indicTitles.append("")
                        indicText.append(
                            str(
                                BeautifulSoup(
Exemplo n.º 33
0
    def __init__(self, mainWin, modulesWithNewerFileDates):
        """Build the Plug-in Manager dialog and block until it is closed.

        The dialog is laid out in a grid: a column of "find plug-in"
        buttons on the left, a tree of known modules and a tree of classes
        on the right, a details panel below them, and the enable/disable
        and close/cancel buttons at the bottom.  The constructor ends by
        grabbing input and waiting on the window, so it only returns once
        the dialog is gone.

        Args:
            mainWin: Object providing ``parent`` (the parent window); it is
                kept as ``self.cntlr``.
            modulesWithNewerFileDates: Stored unchanged on the instance
                (presumably the modules with an update available -- confirm
                against the tree-loading code).
        """
        super(DialogPluginManager, self).__init__(mainWin.parent)
        
        self.ENABLE = _("Enable")
        self.DISABLE = _("Disable")
        self.parent = mainWin.parent
        self.cntlr = mainWin
        
        # copy plugins for temporary display
        # NOTE(review): this binds the PluginManager object itself; no copy
        # is made on this line -- confirm that is intended.
        self.pluginConfig = PluginManager.pluginConfig
        self.pluginConfigChanged = False
        self.uiClassMethodsChanged = False
        self.modelClassesChanged = False
        self.customTransformsChanged = False
        self.disclosureSystemTypesChanged = False
        self.hostSystemFeaturesChanged = False
        self.modulesWithNewerFileDates = modulesWithNewerFileDates
        
        # Parent geometry has the form "WxH+X+Y"; groups 3 and 4 are the
        # window offsets.  Raw string: "\d" is not a valid string escape.
        parentGeometry = re.match(r"(\d+)x(\d+)[+]?([-]?\d+)[+]?([-]?\d+)", self.parent.geometry())
        dialogX = int(parentGeometry.group(3))
        dialogY = int(parentGeometry.group(4))

        self.title(_("Plug-in Manager"))
        frame = Frame(self)
        
        # left button frame
        buttonFrame = Frame(frame, width=40)
        buttonFrame.columnconfigure(0, weight=1)
        addLabel = Label(buttonFrame, text=_("Find plug-in modules:"), wraplength=60, justify="center")
        addSelectLocalButton = Button(buttonFrame, text=_("Select"), command=self.selectLocally)
        ToolTip(addSelectLocalButton, text=_("Select python module files from the local plugin directory."), wraplength=240)
        addBrowseLocalButton = Button(buttonFrame, text=_("Browse"), command=self.browseLocally)
        ToolTip(addBrowseLocalButton, text=_("File chooser allows browsing and selecting python module files to add (or reload) plug-ins, from the local file system."), wraplength=240)
        addWebButton = Button(buttonFrame, text=_("On Web"), command=self.findOnWeb)
        ToolTip(addWebButton, text=_("Dialog to enter URL full path to load (or reload) plug-ins, from the web or local file system."), wraplength=240)
        addLabel.grid(row=0, column=0, pady=4)
        addSelectLocalButton.grid(row=1, column=0, pady=4)
        addBrowseLocalButton.grid(row=2, column=0, pady=4)
        addWebButton.grid(row=3, column=0, pady=4)
        buttonFrame.grid(row=0, column=0, rowspan=3, sticky=(N, S, W), padx=3, pady=3)
        
        # right tree frame (plugins already known to arelle)
        modulesFrame = Frame(frame, width=720)
        vScrollbar = Scrollbar(modulesFrame, orient=VERTICAL)
        hScrollbar = Scrollbar(modulesFrame, orient=HORIZONTAL)
        self.modulesView = Treeview(modulesFrame, xscrollcommand=hScrollbar.set, yscrollcommand=vScrollbar.set, height=7)
        self.modulesView.grid(row=0, column=0, sticky=(N, S, E, W))
        self.modulesView.bind('<<TreeviewSelect>>', self.moduleSelect)
        hScrollbar["command"] = self.modulesView.xview
        hScrollbar.grid(row=1, column=0, sticky=(E,W))
        vScrollbar["command"] = self.modulesView.yview
        vScrollbar.grid(row=0, column=1, sticky=(N,S))
        modulesFrame.columnconfigure(0, weight=1)
        modulesFrame.rowconfigure(0, weight=1)
        modulesFrame.grid(row=0, column=1, columnspan=4, sticky=(N, S, E, W), padx=3, pady=3)
        self.modulesView.focus_set()

        # module tree columns
        self.modulesView.column("#0", width=120, anchor="w")
        self.modulesView.heading("#0", text=_("Name"))
        self.modulesView["columns"] = ("author", "ver", "status", "date", "update", "descr", "license")
        self.modulesView.column("author", width=100, anchor="w", stretch=False)
        self.modulesView.heading("author", text=_("Author"))
        self.modulesView.column("ver", width=60, anchor="w", stretch=False)
        self.modulesView.heading("ver", text=_("Version"))
        self.modulesView.column("status", width=50, anchor="w", stretch=False)
        self.modulesView.heading("status", text=_("Status"))
        self.modulesView.column("date", width=70, anchor="w", stretch=False)
        self.modulesView.heading("date", text=_("File Date"))
        self.modulesView.column("update", width=50, anchor="w", stretch=False)
        self.modulesView.heading("update", text=_("Update"))
        self.modulesView.column("descr", width=200, anchor="w", stretch=False)
        self.modulesView.heading("descr", text=_("Description"))
        self.modulesView.column("license", width=70, anchor="w", stretch=False)
        self.modulesView.heading("license", text=_("License"))

        # classes tree, below the module tree
        classesFrame = Frame(frame)
        vScrollbar = Scrollbar(classesFrame, orient=VERTICAL)
        hScrollbar = Scrollbar(classesFrame, orient=HORIZONTAL)
        self.classesView = Treeview(classesFrame, xscrollcommand=hScrollbar.set, yscrollcommand=vScrollbar.set, height=5)
        self.classesView.grid(row=0, column=0, sticky=(N, S, E, W))
        hScrollbar["command"] = self.classesView.xview
        hScrollbar.grid(row=1, column=0, sticky=(E,W))
        vScrollbar["command"] = self.classesView.yview
        vScrollbar.grid(row=0, column=1, sticky=(N,S))
        classesFrame.columnconfigure(0, weight=1)
        classesFrame.rowconfigure(0, weight=1)
        classesFrame.grid(row=1, column=1, columnspan=4, sticky=(N, S, E, W), padx=3, pady=3)
        self.classesView.focus_set()
        
        self.classesView.column("#0", width=200, anchor="w")
        self.classesView.heading("#0", text=_("Class"))
        self.classesView["columns"] = ("modules",)
        self.classesView.column("modules", width=500, anchor="w", stretch=False)
        self.classesView.heading("modules", text=_("Modules"))
        
        # bottom frame module info details
        moduleInfoFrame = Frame(frame, width=700)
        moduleInfoFrame.columnconfigure(1, weight=1)
        
        self.moduleNameLabel = Label(moduleInfoFrame, wraplength=600, justify="left", 
                                     font=font.Font(family='Helvetica', size=12, weight='bold'))
        self.moduleNameLabel.grid(row=0, column=0, columnspan=4, sticky=W)
        self.moduleAuthorHdr = Label(moduleInfoFrame, text=_("author:"), state=DISABLED)
        self.moduleAuthorHdr.grid(row=1, column=0, sticky=W)
        self.moduleAuthorLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
        self.moduleAuthorLabel.grid(row=1, column=1, columnspan=3, sticky=W)
        self.moduleDescrHdr = Label(moduleInfoFrame, text=_("description:"), state=DISABLED)
        self.moduleDescrHdr.grid(row=2, column=0, sticky=W)
        self.moduleDescrLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
        self.moduleDescrLabel.grid(row=2, column=1, columnspan=3, sticky=W)
        self.moduleClassesHdr = Label(moduleInfoFrame, text=_("classes:"), state=DISABLED)
        self.moduleClassesHdr.grid(row=3, column=0, sticky=W)
        self.moduleClassesLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
        self.moduleClassesLabel.grid(row=3, column=1, columnspan=3, sticky=W)
        ToolTip(self.moduleClassesLabel, text=_("List of classes that this plug-in handles."), wraplength=240)
        self.moduleVersionHdr = Label(moduleInfoFrame, text=_("version:"), state=DISABLED)
        self.moduleVersionHdr.grid(row=4, column=0, sticky=W)
        self.moduleVersionLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
        self.moduleVersionLabel.grid(row=4, column=1, columnspan=3, sticky=W)
        ToolTip(self.moduleVersionLabel, text=_("Version of plug-in module."), wraplength=240)
        self.moduleUrlHdr = Label(moduleInfoFrame, text=_("URL:"), state=DISABLED)
        self.moduleUrlHdr.grid(row=5, column=0, sticky=W)
        self.moduleUrlLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
        self.moduleUrlLabel.grid(row=5, column=1, columnspan=3, sticky=W)
        ToolTip(self.moduleUrlLabel, text=_("URL of plug-in module (local file path or web loaded file)."), wraplength=240)
        self.moduleDateHdr = Label(moduleInfoFrame, text=_("date:"), state=DISABLED)
        self.moduleDateHdr.grid(row=6, column=0, sticky=W)
        self.moduleDateLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
        self.moduleDateLabel.grid(row=6, column=1, columnspan=3, sticky=W)
        ToolTip(self.moduleDateLabel, text=_("Date of currently loaded module file (with parenthetical node when an update is available)."), wraplength=240)
        self.moduleLicenseHdr = Label(moduleInfoFrame, text=_("license:"), state=DISABLED)
        self.moduleLicenseHdr.grid(row=7, column=0, sticky=W)
        self.moduleLicenseLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
        self.moduleLicenseLabel.grid(row=7, column=1, columnspan=3, sticky=W)
        self.moduleImportsHdr = Label(moduleInfoFrame, text=_("imports:"), state=DISABLED)
        self.moduleImportsHdr.grid(row=8, column=0, sticky=W)
        self.moduleImportsLabel = Label(moduleInfoFrame, wraplength=600, justify="left")
        self.moduleImportsLabel.grid(row=8, column=1, columnspan=3, sticky=W)
        self.moduleEnableButton = Button(moduleInfoFrame, text=self.ENABLE, state=DISABLED, command=self.moduleEnable)
        ToolTip(self.moduleEnableButton, text=_("Enable/disable plug in."), wraplength=240)
        self.moduleEnableButton.grid(row=9, column=1, sticky=E)
        self.moduleReloadButton = Button(moduleInfoFrame, text=_("Reload"), state=DISABLED, command=self.moduleReload)
        ToolTip(self.moduleReloadButton, text=_("Reload/update plug in."), wraplength=240)
        self.moduleReloadButton.grid(row=9, column=2, sticky=E)
        self.moduleRemoveButton = Button(moduleInfoFrame, text=_("Remove"), state=DISABLED, command=self.moduleRemove)
        ToolTip(self.moduleRemoveButton, text=_("Remove plug in from plug in table (does not erase the plug in's file)."), wraplength=240)
        self.moduleRemoveButton.grid(row=9, column=3, sticky=E)
        moduleInfoFrame.grid(row=2, column=0, columnspan=5, sticky=(N, S, E, W), padx=3, pady=3)
        moduleInfoFrame.config(borderwidth=4, relief="groove")
        
        # dialog-level buttons
        okButton = Button(frame, text=_("Close"), command=self.ok)
        ToolTip(okButton, text=_("Accept and changes (if any) and close dialog."), wraplength=240)
        cancelButton = Button(frame, text=_("Cancel"), command=self.close)
        ToolTip(cancelButton, text=_("Cancel changes (if any) and close dialog."), wraplength=240)
        okButton.grid(row=3, column=3, sticky=(S,E), pady=3)
        cancelButton.grid(row=3, column=4, sticky=(S,E), pady=3, padx=3)
        
        enableDisableFrame = Frame(frame)
        enableDisableFrame.grid(row=3, column=1, sticky=(S,W), pady=3)
        enableAllButton = Button(enableDisableFrame, text=_("Enable All"), command=self.enableAll)
        ToolTip(enableAllButton, text=_("Enable all plug ins."), wraplength=240)
        disableAllButton = Button(enableDisableFrame, text=_("Disable All"), command=self.disableAll)
        ToolTip(disableAllButton, text=_("Disable all plug ins."), wraplength=240)
        enableAllButton.grid(row=1, column=1)
        disableAllButton.grid(row=1, column=2)
        
        self.loadTreeViews()

        # Place the dialog at a fixed offset from the parent window.
        self.geometry("+{0}+{1}".format(dialogX+50,dialogY+100))
        frame.grid(row=0, column=0, sticky=(N,S,E,W))
        frame.columnconfigure(0, weight=0)
        frame.columnconfigure(1, weight=1)
        frame.rowconfigure(0, weight=1)
        window = self.winfo_toplevel()
        window.columnconfigure(0, weight=1)
        window.rowconfigure(0, weight=1)
        
        self.bind("<Return>", self.ok)
        self.bind("<Escape>", self.close)
        
        # Closing the window through the window manager behaves like Cancel.
        self.protocol("WM_DELETE_WINDOW", self.close)
        # Grab input and wait for the dialog window to be destroyed.
        self.grab_set()
        self.wait_window(self)
Exemplo n.º 34
0
		try:
			data['author'] = doc.info[0]['Author'].decode("utf-16")
		except UnicodeDecodeError:
			data['author'] = str(doc.info[0]['Author'])
except:
	a = 0  # do nothing
'''

# Data extraction: walk the lines and track which section of the paper
# each line belongs to.

copy = 'author'  # Section currently being copied ('' means none)
# copied = [] (see the start of the program)
cpt = 0
for line in lines:
    # Abstract: the lower-cased line has "abstract" or "abstracts" after at
    # most five leading characters; only the first such line is honoured.
    if reg.match(r'^.{0,5}abstract(s)?.{0,100}\n',
                 line.lower()) and not 'abstract' in copied:
        copy = 'abstract'
        copied.append(copy)
    # Introduction: same idea, with at most three characters after the word.
    elif reg.match(r'^.{0,5}introduction(s)?.{0,3}\n',
                   line.lower()) and not 'introduction' in copied:
        copy = 'introduction'
        copied.append(copy)
    # End of abstract: a short line starting with '1', '|' or 'I' seen after
    # the abstract but before any introduction heading.
    # NOTE(review): inside [...] the '|' is a literal character, not an
    # alternation -- confirm it is wanted.  The possessive '{0,40}+' is not
    # accepted by every regex engine; presumably `reg` is the third-party
    # `regex` module -- confirm.
    #elif str("1.\n") in line or str("I.\n") in line or str("1.\n") in line:
    elif 'abstract' in copied and reg.match(
            r'^[1|I].{0,40}+\n', line) and not 'introduction' in copied:
        copy = ''
    # End of introduction: a line starting with '2' followed by one of the
    # characters '.', '|', ' ', '+' or a newline.
    elif reg.match(r'^2[\.|\ .+|\n]', line.lower()):
        copy = 'corps'  # end of the introduction: set the flag for the body
Exemplo n.º 35
0
 def _quote_identifier(self, name):
     """Return ``name`` as is when it is a plain identifier, else back-quoted.

     A plain identifier starts with an ASCII letter and continues with
     ASCII letters, digits or underscores only.

     Args:
         name: The identifier text.

     Returns:
         str: ``name`` itself, or ``name`` wrapped in back-quotes.
     """
     # fullmatch instead of match with '$': '$' also matches just before a
     # trailing newline, which let such names through unquoted.
     if re.fullmatch(r'[A-Za-z][A-Za-z_0-9]*', name):
         return name
     return '`{}`'.format(name)
Exemplo n.º 36
0
    def _parse_map_uri(self):
        """
        Resolve the step's optional map URI template into map URIs.

        The template (``self._step['map']['uri']``) is either empty or has
        one of these forms:
            ${workflow->input-name}: 'input-name' must be a key of the
                workflow-level inputs (self._inputs); every URI listed
                there is parsed and recorded.
            ${step-name->output}: 'step-name' must be listed in the step's
                'depend' list; the first URI of that dependency is recorded.

        Args:
            self: class instance.

        Returns:
            On success: True.
            On failure: the value returned by self._fatal() after the
                error has been logged.

        """
        template = self._step['map']['uri']

        if not template:
            # map URI is an optional definition field
            self._map_uri = []
            return True

        match = re.match(r'\${([^{}]+)->([^{}]+)}', template)
        if not match:
            # invalid format
            msg = 'invalid template value for step map uri: {}'.format(
                template)
            Log.an().error(msg)
            return self._fatal(msg)

        source, name = match.group(1), match.group(2)

        if source == 'workflow':
            # use workflow-level input uri; the name must be a known input
            if name not in self._inputs:
                msg = 'invalid template reference to input: {}'\
                    .format(template)
                Log.an().error(msg)
                return self._fatal(msg)

            # every input URI used as a map URI has to be valid
            for input_uri in self._inputs[name]:
                parsed_map_uri = URIParser.parse(input_uri)
                if not parsed_map_uri:
                    msg = 'invalid map uri for inputs.{}: {}'\
                        .format(name, input_uri)
                    Log.an().error(msg)
                    return self._fatal(msg)
                self._parsed_map_uris.append(parsed_map_uri)
                self._map_uris.append(parsed_map_uri['chopped_uri'])

            return True

        # use uri from previous step, which must be a dependency
        if source not in self._step['depend']:
            msg = 'template reference to step must be listed as dependent: {}'\
                .format(template)
            Log.an().error(msg)
            return self._fatal(msg)

        if name != 'output':
            msg = 'invalid template reference, must be "output": {}'\
                .format(template)
            Log.an().error(msg)
            return self._fatal(msg)

        self._map_uris.append(self._depend_uris[source][0]['chopped_uri'])
        self._parsed_map_uris.append(self._depend_uris[source][0])

        return True
Exemplo n.º 37
0
# Conor Rabbitte
# Testing for Thompsons Construction

import regex

# Run the checks only when this file is executed as a script
if __name__ == "__main__":
    # Each entry is [pattern, candidate string, expected match result]
    tests = [["a.b|b*", "bbbb", True], ["a.b|b*", "abbb", False],
             ["a?.b.c", "abc", True], ["a?.b.c", "aaaaabc", False],
             ["a+.b.c", "aaaaaabc", True], ["a+.b.c", "abbbc", False],
             ["b*", "b", True], ["b*", "", True], ["b*", "abbbb", False]]

    # The loop is inside the guard because it needs `tests`; left at module
    # level it raises a NameError whenever the file is imported.
    for pattern, text, expected in tests:
        passed = regex.match(pattern, text) == expected
        # Report the real outcome of the check, not the expected match value
        print("Test: " + pattern + ", " + text +
              ("\t\tPassed" if passed else "\t\tFailed"))
        assert passed, pattern + \
            (" should match " if expected else " should not match ") + text
Exemplo n.º 38
0
def algorithm(Edgepair_Set, i, j, k):
    """Trace cycles through ``Edgepair_Set`` and compare their count with ``i + j + k + 1``.

    Every element of ``Edgepair_Set`` is indexed as ``pair[0]``, ``pair[1]``
    and ``pair[2]``; ``pair[2]`` is compared against ``'+'`` and ``'-'``.
    Starting from each pair not yet used, a successor pair is selected
    according to the form of ``pair[1]`` (a digit, a digit plus sign,
    ``h``/``v`` plus digit with an optional sign, or ``'c'``) and the walk
    continues until it reaches a pair already in the current cycle.

    Returns:
        ``False`` as soon as the reversed, opposite-signed form of a
        selected successor is already part of the current cycle; otherwise
        whether the number of cycles found equals ``i + j + k + 1``.

    NOTE(review): the final ``return`` is inside the outer ``while`` loop, so
    that loop body runs at most once; if the loop condition is false on
    entry the function falls through and returns ``None`` -- confirm.
    NOTE(review): relies on ``multiplication`` and ``negative``, which are
    defined outside this block.
    """


    # start cycle count
    cycle = 0
    # initiate the first cycle
    CYC = []
    UsedPair = []

    while len(Edgepair_Set) != len(UsedPair):

        for pair in Edgepair_Set:
            while pair not in UsedPair:
                # print('-------------new cycle--------------')
                inputpair = pair
                # print('input: ')
                # print(inputpair)

                # runs while the next input pair is not already part of the cycle.
                # returns FALSE if a cycle contains both oriented passes of a graph edge

                while inputpair not in CYC:

                    # print('input not in CYC')
                    # print(inputpair)

                    # Only basic_pattern2 and basic_pattern4 are used below.
                    # NOTE(review): '[[:digit:]]' is POSIX bracket syntax, which
                    # the standard-library `re` module does not interpret as a
                    # digit class; presumably `re` is bound to the third-party
                    # `regex` module here -- confirm.
                    basic_pattern1 = '[[:digit:]](\\+|-)'  # eg: '1+'
                    basic_pattern2 = '[h][[:digit:]]'  # eg: 'h1'
                    basic_pattern3 = '[h][[:digit:]](\\+|-)'  # eg: 'h1+'
                    basic_pattern4 = '[v][[:digit:]]'  # eg: 'v1'
                    basic_pattern5 = '[v][[:digit:]](\\+|-)'  # eg: 'v1+'

                    # select edge next to input pair
                    # NOTE: the loop variable `str` below shadows the builtin
                    # of the same name inside this function.

                    # case: b = i
                    if inputpair[1].isdigit() == True:
                        # print('case: b = i')
                        for str in Edgepair_Set:
                            if str[0] == inputpair[1] + '+' or str[0] == inputpair[1] + '-':
                                if str[2] == multiplication(str[0][1], inputpair[2]):
                                    # print('out: [i+,...]')
                                    outputpair = str
                                    # outputpair_ is the same edge traversed the
                                    # other way: endpoints swapped, sign flipped
                                    if outputpair[2] == '+':
                                        outputpair_ = [outputpair[1], outputpair[0], '-']
                                    elif outputpair[2] == '-':
                                        outputpair_ = [outputpair[1], outputpair[0], '+']

                                    if outputpair_ in CYC:
                                        return False
                                    break

                    # case: b = i+/-
                    elif inputpair[1][0].isdigit() == True and inputpair[1].isdigit() == False:
                        # print('case: b = i+')
                        for str in Edgepair_Set:
                            if str[0] == inputpair[1][0] and str[2] == negative(
                                    multiplication(inputpair[2], inputpair[1][1])):
                                outputpair = str
                                if outputpair[2] == '+':
                                    outputpair_ = [outputpair[1], outputpair[0], '-']
                                elif outputpair[2] == '-':
                                    outputpair_ = [outputpair[1], outputpair[0], '+']

                                if outputpair_ in CYC:
                                    return False
                                break

                    # case: b = hj
                    # (no break in this loop: the last matching pair is kept)
                    elif inputpair[1][0] == 'h' and inputpair[1][1].isdigit() == True and len(inputpair[1]) == 2:
                        # print('case: b = hj')
                        for str in Edgepair_Set:

                            if str[0] == inputpair[1] + '+' or str[0] == inputpair[1] + '-':
                                if str[2] == multiplication(inputpair[2], str[0][2]):
                                    outputpair = str
                                    if outputpair[2] == '+':
                                        outputpair_ = [outputpair[1], outputpair[0], '-']
                                    elif outputpair[2] == '-':
                                        outputpair_ = [outputpair[1], outputpair[0], '+']

                                    if outputpair_ in CYC:
                                        return False


                    # case: b = hj+/-
                    # (no break in this loop: the last matching pair is kept)
                    elif inputpair[1][0] == 'h' and inputpair[1][1].isdigit() == True and len(inputpair[1]) == 3:
                        # print('case: b = hj+')
                        for str in Edgepair_Set:

                            if str[0] == inputpair[1][:-1] and str[2] == negative(
                                    multiplication(inputpair[2], inputpair[1][2])):
                                # print('output: [hi,...]')
                                outputpair = str
                                if outputpair[2] == '+':
                                    outputpair_ = [outputpair[1], outputpair[0], '-']
                                elif outputpair[2] == '-':
                                    outputpair_ = [outputpair[1], outputpair[0], '+']

                                # print('find output')
                                # print(outputpair)
                                if outputpair_ in CYC:
                                    return False


                    # case: b = vk
                    # (no break in this loop: the last matching pair is kept)
                    elif inputpair[1][0] == 'v' and inputpair[1][1].isdigit() == True and len(inputpair[1]) == 2:
                        # print('case: b = vk')
                        for str in Edgepair_Set:

                            if str[0] == inputpair[1] + '+' or str[0] == inputpair[1] + '-':
                                if str[2] == negative(multiplication(inputpair[2], str[0][2])):
                                    outputpair = str
                                    if outputpair[2] == '+':
                                        outputpair_ = [outputpair[1], outputpair[0], '-']
                                    elif outputpair[2] == '-':
                                        outputpair_ = [outputpair[1], outputpair[0], '+']

                                    if outputpair_ in CYC:
                                        return False


                    # case: b = vk+/-
                    elif inputpair[1][0] == 'v' and inputpair[1][1].isdigit() == True and len(inputpair[1]) == 3:
                        # print('case: b = vk+')
                        for str in Edgepair_Set:
                            if str[0] == inputpair[1][:-1] and str[2] == multiplication(inputpair[2], inputpair[1][2]):
                                # print('find output [vi,..]')
                                outputpair = str
                                if outputpair[2] == '+':
                                    outputpair_ = [outputpair[1], outputpair[0], '-']
                                elif outputpair[2] == '-':
                                    outputpair_ = [outputpair[1], outputpair[0], '+']

                                if outputpair_ in CYC:
                                    return False
                                break

                    # case: b = c & a = hi
                    elif inputpair[1] == 'c' and re.match(basic_pattern2, inputpair[0]) != None:
                        for str in Edgepair_Set:

                            if str[0] == 'c' and re.match(basic_pattern4, str[1]) != None and str[2] == inputpair[2]:
                                outputpair = str
                                if outputpair[2] == '+':
                                    outputpair_ = [outputpair[1], outputpair[0], '-']
                                elif outputpair[2] == '-':
                                    outputpair_ = [outputpair[1], outputpair[0], '+']

                                if outputpair_ in CYC:
                                    return False
                                break

                    # case: b = c & a = vk
                    elif inputpair[1] == 'c' and re.match(basic_pattern4, inputpair[0]) != None:
                        for str in Edgepair_Set:

                            if str[0] == 'c' and re.match(basic_pattern2, str[1]) != None and str[2] == negative(
                                    inputpair[2]):
                                outputpair = str
                                if outputpair[2] == '+':
                                    outputpair_ = [outputpair[1], outputpair[0], '-']
                                elif outputpair[2] == '-':
                                    outputpair_ = [outputpair[1], outputpair[0], '+']

                                if outputpair_ in CYC:
                                    return False
                                break

                    '''
                    print('output')
                    print(outputpair)
                    '''
                    # adds pair to current cycle
                    CYC.append(inputpair)
                    # continues with selected pair as input pair
                    # NOTE(review): outputpair keeps its previous value (or is
                    # unbound on the very first step) when no case above
                    # selected a successor -- confirm inputs always yield one.
                    inputpair = outputpair

                    # print('new input')
                    # print(inputpair)

                # Increments count when cycle complete
                if len(CYC) > 1:
                    cycle = cycle + 1
                    # tracks which edges have been used in a cycle
                    UsedPair.extend(CYC)
                    '''
                    print('CYC')
                    print(CYC)
                    print('cycleNum')
                    print(cycle)
                    '''
                    CYC = []

        # Reached after one full pass over Edgepair_Set (still inside the
        # outer while loop).
        return cycle == i + j + k + 1
Exemplo n.º 39
0
    def word_to_tuples(self, word, normpunc=False):
        """Split a word into tuples aligning orthography with IPA segments.

        Args:
            word (unicode): word to transliterate
            normpunc (bool): if True, each non-letter character is passed
                             through `self.puncnorm.norm` before being stored

        Returns:
            list: (category, lettercase, orthographic_form, phonetic_form,
                  feature_vectors) tuples.

        Each maximal run of ASCII letters is transliterated as one unit with
        `self.transliterate`; its letters and the resulting IPA segments are
        then paired position by position, the shorter side being padded with
        empty strings. Every other character yields one tuple with an empty
        phonetic form.

        `feature_vectors` is a list of (segment, vector) pairs. Vector
        entries are 1 for '+', 0 for '0', -1 for '-' and None for any other
        value. An empty phonetic form gets the single placeholder pair
        (-1, [0] * self.num_panphon_fts).
        """
        feature_codes = {'+': 1, '0': 0, '-': -1}

        def cat_and_cap(c):
            # A Unicode category is two characters: major class, then
            # subclass. Only the 'u' subclass counts as upper case.
            major, minor = unicodedata.category(c)
            return unicode(major), (1 if minor == 'u' else 0)

        def vec2bin(vec):
            # Unknown feature symbols map to None via dict.get.
            return map(feature_codes.get, vec)

        def to_vectors(phon):
            if phon == '':
                return [(-1, [0] * self.num_panphon_fts)]
            return [(seg, vec2bin(self.ft.segment_to_vector(seg)))
                    for seg in self.ft.ipa_segs(phon)]

        collected = []
        remaining = unicodedata.normalize('NFKD', unicode(word))
        remaining = unicodedata.normalize('NFC', remaining)
        while remaining:
            letters = re.match('[A-Za-z]+', remaining)
            if letters is None:
                # Single non-letter character: no phonetic form.
                char = remaining[0]
                if normpunc:
                    char = self.puncnorm.norm(char)
                cat, case = cat_and_cap(char)
                if normpunc and cat in self.puncnorm:
                    cat = 'P'
                collected.append((cat, case, char, '', to_vectors('')))
                remaining = remaining[1:]
                continue

            run = letters.group(0)
            # The case flag of the whole run is taken from its first letter.
            cat, case = cat_and_cap(run[0])
            segments = self.ft.ipa_segs(self.transliterate(run))
            width = max(len(segments), len(run))
            graphemes = list(run) + [''] * (width - len(run))
            segments += [''] * (width - len(segments))
            for seg, grapheme in zip(segments, graphemes):
                collected.append(('L', case, grapheme, seg, to_vectors(seg)))
            remaining = remaining[len(run):]
        return collected
Exemplo n.º 40
0
class Source:
    """A source file path together with the set of symbols recorded for it."""

    def __init__(self, path):
        # Starts empty; callers add symbol names to this set afterwards.
        self.syms = set()
        self.path = path

# Index every .cpp file under src/wallet by its base name. Several files may
# share a base name, so each entry is a list of Source objects.
for dirpath, dirnames, filenames in os.walk("src/wallet"):
    for filename in filenames:
        if filename.endswith(".cpp"):
            sources.setdefault(filename, []).append(
                Source(os.path.join(dirpath, filename)))
# setdefault (rather than plain indexing) so that a tree without any
# src/wallet/**/wallet.cpp does not abort the script with a KeyError.
sources.setdefault("wallet.cpp", []).append(
    Source("src/interfaces/wallet.cpp"))

# Linker diagnostics of the form "<file>:... undefined reference to `<sym>'"
# are read from stdin; the pattern is compiled once for the whole stream.
undefined_ref_re = regex.compile(
    "^([^:]+):.*? undefined reference to `(.*?)'$")
for line in sys.stdin:
    m = undefined_ref_re.match(line)
    if m:
        filename, sym = m.groups()
        if filename in sources:
            for source in sources[filename]:
                source.syms.add(sym)

for source_list in sources.values():
    for source in source_list:
        with open(source.path) as fp:
            code = fp.read()

        for sym in source.syms:
            p = sym.find("(")
            if p < 0:
                # Symbol without a parameter list: wrap each whole-word
                # occurrence. The symbol is literal text, so it is escaped
                # before being embedded in the pattern.
                pattern = r"\b(" + regex.escape(sym) + r")\b"
                code = regex.sub(pattern, r"FIXME_IMPLEMENT_IPC_VALUE(\1)",
                                 code)
        # NOTE(review): the rewritten `code` is not written back anywhere in
        # this excerpt - confirm the remainder of the script handles that.
Exemplo n.º 41
0
def validate_japanese(word):
    """Check that `word` begins like a Japanese word.

    Returns False when `word` is empty or whitespace-only, or when its first
    character is a non-word character. Otherwise returns the result of
    matching a Hiragana, Katakana or Han character at the start of `word`:
    a match object on success, None on failure.
    """
    if regex.match(r'^\s*$', word):
        return False
    if regex.match(r'\W', word):
        return False
    return regex.match(r'\p{Hiragana}|\p{Katakana}|\p{Han}', word)
            Text_pdf_0_NL = ' '.join(all_pdf_text1.split())

            Tokens = Text_pdf_0.split()
            Labels = y_final
            Real_Tokens = Text_pdf_0_NL.split()

            autors_surname = []
            for i in range(len(autors[paper])):
                if i % 2 == 0:
                    autors_surname.append(autors[paper][i])

            autors_surname_lower = []
            for i in range(len(autors_surname)):
                autors_surname_lower.append(autors_surname[i].lower())

            if re.match('.\.', autors[paper][1]) == None:
                autors_forename = []
                for i in range(len(autors[paper])):
                    if i % 2 == 1:
                        autors_forename.append(autors[paper][i].split())

                autors_forename = list(
                    np.concatenate((autors_forename), axis=None))
                autors_forename_lower = []
                for i in range(len(autors_forename)):
                    autors_forename_lower.append(autors_forename[i].lower())

                autors_surname_lower = list(
                    np.concatenate(
                        (autors_forename_lower, autors_surname_lower),
                        axis=None))
def sortable_paragraph_number(string):
    """Left-pad a string that starts with digits so short numbers sort first.

    Strings without leading digits are returned unchanged. Otherwise one
    "0 " filler is prepended for every digit the leading number falls short
    of MIN_DIGITS; numbers that already have MIN_DIGITS or more digits get
    no padding.
    """
    MIN_DIGITS = 4
    leading_number = regex.match(r"^\d*", string)[0]
    digit_count = len(leading_number)
    if digit_count == 0:
        return string
    # NOTE(review): the filler is "0 " (zero plus space), not a bare "0" -
    # kept as is; confirm this is the intended sort key format.
    padding = "0 " * (MIN_DIGITS - digit_count)
    return padding + string
Exemplo n.º 44
0
Arquivo: build.py Projeto: bbinet/ol3
def build_check_requires_timestamp(t):
    """Report unused and missing goog.require statements in t.dependencies.

    The check runs in three phases:

    1. Collect every namespace declared with goog.provide, both from the
       Closure Library files inside PLOVR_JAR (goog.i18n excluded) and from
       the dependency files themselves.
    2. For each dependency, report goog.require'd namespaces whose name does
       not appear on any other line of the file.
    3. Build one regular expression per top-level namespace from the
       collected provides, scan each dependency (except those listed in
       INTERNAL_SRC / EXTERNAL_SRC) for uses, and report namespaces that are
       used but neither required nor provided by that file.

    Findings are reported through t.info(); if there is at least one,
    t.error() is called with the totals. t.touch() is called at the end.

    Args:
        t: build target object providing `dependencies`, `info`, `error`
           and `touch`.
    """
    from zipfile import ZipFile

    # The dot is escaped so that only a literal "goog.provide" /
    # "goog.require" call at the start of a line is recognised.
    goog_provide_re = re.compile(r'goog\.provide\(\'(.*)\'\);')
    goog_require_re = re.compile(r'goog\.require\(\'(.*)\'\);')

    unused_count = 0
    all_provides = set()
    # Context manager so the archive is closed even if scanning fails
    # (ZipFile supports `with` from Python 2.7 on).
    with ZipFile(PLOVR_JAR) as zf:
        for zi in zf.infolist():
            if zi.filename.endswith('.js'):
                if not zi.filename.startswith('closure/goog/'):
                    continue
                # Skip goog.i18n because it contains so many modules that it
                # causes the generated regular expression to exceed Python's
                # limits
                if zi.filename.startswith('closure/goog/i18n/'):
                    continue
                for line in zf.open(zi):
                    m = goog_provide_re.match(line)
                    if m:
                        all_provides.add(m.group(1))
    for filename in sorted(t.dependencies):
        if filename == 'build/src/internal/src/requireall.js':
            continue
        require_linenos = {}
        uses = set()
        with open(filename) as source_file:
            lines = source_file.readlines()
        for lineno, line in _strip_comments(lines):
            m = goog_provide_re.match(line)
            if m:
                all_provides.add(m.group(1))
                continue
            m = goog_require_re.match(line)
            if m:
                require_linenos[m.group(1)] = lineno
                continue
        # A set gives constant-time membership tests in the loop below.
        ignore_linenos = set(require_linenos.values())
        # NOTE(review): enumerate() numbers lines from 0, while the numbers
        # stored above come from _strip_comments - confirm both use the same
        # base, otherwise the goog.require lines themselves are not skipped.
        for lineno, line in enumerate(lines):
            if lineno in ignore_linenos:
                continue
            for require in require_linenos:
                if require in line:
                    uses.add(require)
        for require in sorted(set(require_linenos) - uses):
            t.info('%s:%d: unused goog.require: %r' %
                   (filename, require_linenos[require], require))
            unused_count += 1
    all_provides.discard('ol')
    all_provides.discard('ol.MapProperty')

    class Node(object):
        """One dotted-name component in the tree of provided namespaces."""

        def __init__(self):
            # True when the path from the root to this node is itself a
            # provided namespace (not merely a prefix of one).
            self.present = False
            self.children = {}

        def _build_re(self, key):
            """Return the regex source matching `key` and its sub-namespaces."""
            if len(self.children) == 1:
                child_key, child = next(iter(self.children.items()))
                child_re = '\\.' + child._build_re(child_key)
                if self.present:
                    return key + '(' + child_re + ')?'
                else:
                    return key + child_re
            elif self.children:
                children_re = '(?:\\.(?:' + '|'.join(
                    self.children[k]._build_re(k)
                    for k in sorted(self.children.keys())) + '))'
                if self.present:
                    return key + children_re + '?'
                else:
                    return key + children_re
            else:
                # A leaf is only ever created as the end of a provide.
                assert self.present
                return key

        def build_re(self, key):
            """Compile the word-bounded regex for the subtree rooted here."""
            return re.compile('\\b' + self._build_re(key) + '\\b')

    root = Node()
    for provide in all_provides:
        node = root
        for component in provide.split('.'):
            if component not in node.children:
                node.children[component] = Node()
            node = node.children[component]
        node.present = True
    provide_res = [
        child.build_re(key) for key, child in root.children.items()
    ]
    missing_count = 0
    for filename in sorted(t.dependencies):
        if filename in INTERNAL_SRC or filename in EXTERNAL_SRC:
            continue
        provides = set()
        requires = set()
        uses = set()
        uses_linenos = {}
        with open(filename) as source_file:
            for lineno, line in _strip_comments(source_file):
                m = goog_provide_re.match(line)
                if m:
                    provides.add(m.group(1))
                    continue
                m = goog_require_re.match(line)
                if m:
                    requires.add(m.group(1))
                    continue
                # Repeatedly cut the first matching namespace out of the line
                # until no pattern matches, so every use on the line is
                # recorded.
                while True:
                    for provide_re in provide_res:
                        m = provide_re.search(line)
                        if m:
                            uses.add(m.group())
                            uses_linenos[m.group()] = lineno
                            line = line[:m.start()] + line[m.end():]
                            break
                    else:
                        break
        # Hard-coded exemptions: these renderer files may mention
        # ol.renderer.Map (and the renderer-specific Map) without requiring
        # it.
        if filename == 'src/ol/renderer/layerrenderer.js':
            uses.discard('ol.renderer.Map')
        m = re.match(r'src/ol/renderer/(\w+)/\1(\w*)layerrenderer\.js\Z',
                     filename)
        if m:
            uses.discard('ol.renderer.Map')
            uses.discard('ol.renderer.%s.Map' % (m.group(1), ))
        missing_requires = uses - requires - provides
        if missing_requires:
            for missing_require in sorted(missing_requires):
                t.info(
                    "%s:%d missing goog.require('%s')" %
                    (filename, uses_linenos[missing_require], missing_require))
                missing_count += 1
    if unused_count or missing_count:
        t.error('%d unused goog.requires, %d missing goog.requires' %
                (unused_count, missing_count))
    t.touch()
Exemplo n.º 45
0
    def _inic_dic_vars(símismo):
        """Rebuild `símismo.variables` and `símismo._conv_nombres` from the model.

        Both dictionaries are cleared, then refilled from the rows returned by
        `símismo.mod.doc()`. Each variable first gets a provisional type
        ('nivel', 'auxiliar' or 'constante'); later passes relabel some of
        them as 'inicial' or 'flujo' and finally fill each variable's 'hijos'
        list from the 'parientes' of the other variables.
        """
        símismo.variables.clear()
        símismo._conv_nombres.clear()

        # Built-in names that are skipped when registering variables and when
        # collecting flows.
        internos = ['FINAL TIME', 'TIME STEP', 'SAVEPER', 'INITIAL TIME']
        for i, f in símismo.mod.doc().iterrows():

            # Lookup rows are not registered as variables.
            if f['Type'] == 'lookup':
                continue

            nombre = f['Real Name']
            if nombre not in internos:
                nombre_py = f['Py Name']
                unidades = f['Unit']
                líms = literal_eval(f['Lims'])
                ec = f['Eqn']
                obj_ec = Ecuación(ec)
                # True when the equation contains a GAME function call.
                var_juego = obj_ec.sacar_args_func('GAME') is not None

                if símismo.tipo_mod == '.mdl':
                    # In .mdl models a level is an equation starting with
                    # INTEG(.
                    if regex.match(r'INTEG *\(', ec):
                        tipo = 'nivel'
                    else:
                        tipo = 'auxiliar'  # the rest are reassigned later
                else:
                    # Otherwise a level is recognised by the presence of an
                    # 'integ_<py name>' attribute on the model components.
                    try:
                        getattr(símismo.mod.components, 'integ_' + nombre_py)
                        tipo = 'nivel'
                    except AttributeError:
                        tipo = 'auxiliar'

                parientes = obj_ec.variables()

                # An auxiliary with no parents is a constant.
                if tipo == 'auxiliar' and not len(parientes):
                    tipo = 'constante'

                símismo.variables[nombre] = {
                    'val': getattr(símismo.mod.components, nombre_py)(),
                    'unidades': unidades,
                    'ec': ec,
                    'hijos': [],
                    'parientes': parientes,
                    'líms': líms,
                    'info': f['Comment'],
                    'tipo': tipo,
                    'ingreso': True,
                    'egreso': not var_juego
                }

                símismo._conv_nombres[nombre] = nombre_py

        # Apply the other variable types
        for niv in símismo.niveles():
            ec = Ecuación(símismo.obt_ec_var(niv), dialecto='vensim')
            if símismo.tipo_mod == '.mdl':
                args_integ, args_inic = ec.sacar_args_func(
                    'INTEG')  # Parse the VENSIM INTEG function

                # Identify initial variables
                if args_inic in símismo.variables:
                    símismo.variables[args_inic]['tipo'] = 'inicial'

                # Flows are, by definition, the other parents of the levels.
                flujos = [
                    v for v in Ecuación(args_integ,
                                        dialecto='vensim').variables()
                    if v not in internos
                ]
            else:
                flujos = ec.variables()

            for flujo in flujos:
                # Mark every flow of this level.
                símismo.variables[flujo]['tipo'] = 'flujo'

        # Detect the XMILE initial variables
        if símismo.tipo_mod == '.xmile':
            for nv in ET.parse(símismo.archivo).getroot().iter(
                    '{http://www.systemdynamics.org/XMILE}stock'):
                inic = nv.find('{http://www.systemdynamics.org/XMILE}eqn').text

                # Only a stock equation that is exactly a known variable name
                # counts as an initial variable.
                if inic in símismo.variables:
                    símismo.variables[inic]['tipo'] = 'inicial'
                    # NOTE(review): .add() assumes 'parientes' is set-like -
                    # confirm against Ecuación.variables().
                    símismo.variables[nv.attrib['name']]['parientes'].add(inic)

        # Apply parents to children
        for v, d_v in símismo.variables.items():
            for p in d_v['parientes']:
                d_p = símismo.variables[p]
                d_p['hijos'].append(v)
Exemplo n.º 46
0
 def __eq__(self, other):
     """Compare equal to any str that matches `self.pattern`.

     Non-string values never compare equal. When `self.pattern` is unset or
     empty, every string compares equal.
     """
     if isinstance(other, str):
         return not self.pattern or bool(regex.match(self.pattern, other))
     return False
Exemplo n.º 47
0
    if {"byr", "iyr", "eyr", "hgt", "hcl", "ecl", "pid"} <= passport.keys():
        valid_passports += 1

print(valid_passports)

# Part Two

valid_passports = 0

# Validation rules for part two. Every value is matched in full, so trailing
# junk (e.g. "#123abcz", "170cmx") or a non-numeric field cannot slip through
# or crash int().
_REQUIRED_FIELDS = {"byr", "iyr", "eyr", "hgt", "hcl", "ecl", "pid"}
_YEAR_BOUNDS = {"byr": (1920, 2002), "iyr": (2010, 2020), "eyr": (2020, 2030)}
_HEIGHT_BOUNDS = {"in": (59, 76), "cm": (150, 193)}
_EYE_COLORS = {"amb", "blu", "brn", "gry", "grn", "hzl", "oth"}

for passport in passports_parsed:
    if not _REQUIRED_FIELDS <= passport.keys():
        continue

    # Years must be exactly four digits and fall inside their range.
    years_ok = all(
        regex.fullmatch(r"[0-9]{4}", passport[field])
        and low <= int(passport[field]) <= high
        for field, (low, high) in _YEAR_BOUNDS.items())
    if not years_ok:
        continue

    if passport["ecl"] not in _EYE_COLORS:
        continue
    # Passport id: exactly nine digits, leading zeros allowed.
    if not regex.fullmatch(r"[0-9]{9}", passport["pid"]):
        continue
    # Hair colour: '#' followed by exactly six lowercase hex digits.
    if not regex.fullmatch(r"#[0-9a-f]{6}", passport["hcl"]):
        continue

    height_match = regex.fullmatch(r"\s*(?P<height>[0-9]+)(?P<unit>cm|in)",
                                   passport["hgt"])
    if not height_match:
        continue
    low, high = _HEIGHT_BOUNDS[height_match.group("unit")]
    if low <= int(height_match.group("height")) <= high:
        valid_passports += 1

print(valid_passports)
Exemplo n.º 48
0
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SLEEP_EACH_SCROLL)

        ## Retrieve the divs list
        all_divs = driver.find_elements(By.CSS_SELECTOR,
                                        "div[class='v1Nh3 kIKUG  _bz0w']")
        selected_divs = [x for x in all_divs if x not in s]
        ## Retrieve each image srcset attribute in each div in the divs list
        img_srcset = [
            div.find_element(By.CSS_SELECTOR, "img").get_attribute('srcset')
            for div in selected_divs
        ]
        pattern = re.compile('^http\S+')
        ## Retrieve the correct image url from image srcset list
        string_url_imgs = [
            re.match(pattern=pattern, string=x).group() for x in img_srcset
        ]

        for string_url_img in string_url_imgs:
            download_img_from_link(string_url_img, selfie_or_not=Boolean)

        ## Calculate new scroll height and compare with last scroll height
        ## ... (if the scrolling actually changed something)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        count += 1  # count will be used for pagination afterwards

        s = set(
            all_divs
Exemplo n.º 49
0
    def get_normalized_subnames(self,
                                src_names,
                                separate_to_names=False,
                                config=AutomataVariants.DEFAULT):
        '''
        From a list of names of a given person, create the set of all name
        variants, respecting the lowercase / non-accent settings in `config`.

        For example:
           * ["Havel"] => ["Havel"]
           * ["O'Connor"] => ["O'Connor", "Connor"]
           * ["Ludwig van Beethoven"] => ["Ludwig", "Beethoven", "van Beethoven"]

        :param src_names: iterable of name strings to process
        :param separate_to_names: if True, each name is split on whitespace
            (after removing name prepositions) and every part is processed
            separately; otherwise the name is processed as a whole
        :param config: variant flags queried through
            AutomataVariants.isLowercase / AutomataVariants.isNonaccent
        :return: set of name variants
        '''

        use_lowercase = AutomataVariants.isLowercase(config)
        regex_flags = regex.IGNORECASE if use_lowercase else 0

        # tmp_preposition in the form of "([Vv]an|[Zz]u|..)"
        tmp_prepositions = reUtils.list2FirstIncaseAlternation(
            self.NAME_PREPOSITIONS)
        regex_prepositions_remove = regex.compile(
            r" {} ".format(tmp_prepositions))
        regex_prepositions_name = regex.compile(
            r" {} \p{{Lu}}\p{{L}}+".format(tmp_prepositions),
            flags=regex_flags)

        # tmp_prefixes in the form of "([Dd]\\'|[Oo]\\'|..)"
        tmp_prefixes = reUtils.list2FirstIncaseAlternation(self.NAME_PREFIXES)
        regex_prefixes_only_check = regex.compile(
            r"^{}\p{{Lu}}".format(tmp_prefixes), flags=regex_flags)
        regex_prefixes_only = regex.compile(r"^{}".format(tmp_prefixes))

        str_regex_location_remove = r" (?:{}) .*".format("|".join(
            map(regex.escape, self.LOCATION_PREPOSITIONS)))
        regex_location_remove = regex.compile(str_regex_location_remove,
                                              flags=regex_flags)
        # this should match only a nice name (must support prefixes)
        regex_name = regex.compile(
            r"^( ?(?:{})?\p{{Lu}}(\p{{L}}+)?(['-]\p{{Lu}}\p{{L}}+)*)+(?:{})?$".
            format(tmp_prefixes, str_regex_location_remove),
            flags=regex_flags)

        names = set()

        for name in src_names:
            # normalize whitespaces
            name = regex.sub(r'\s+', ' ', name)
            # remember "<word> <location preposition> <rest>" before the
            # location part is stripped from the name below
            subname_location = regex.search(
                r"([^ ]+" + str_regex_location_remove + r")", name)
            if subname_location:
                subname_location = subname_location.group()
            # remove a part of the name with location information (e.g. " of Polestown" from the name "Richard Butler of Polestown")
            name = regex_location_remove.sub("", name)
            # names written entirely in upper case are left untouched
            if name.upper() != name:
                name = name.title()

            if separate_to_names:
                # split the name only (without prepositions) to the parts
                subnames = regex_prepositions_remove.sub(" ", name).split()
            else:
                subnames = [name]

            if subname_location:
                subnames.append(subname_location)

            for subname in subnames:
                # Strip a trailing comma BEFORE the emptiness check: a bare
                # "," would otherwise become "" and fail on subname[0] below.
                if subname.endswith(","):
                    subname = subname[:-1]
                if not subname:
                    continue

                # skip invalid / forbidden names
                if subname in self.FORBIDDEN_NAMES:
                    continue

                # normalize name to start with capital, including name with prefix (for example o'... => O'...)
                subname = subname[0].upper() + subname[1:]
                subname_without_accent = remove_accent(subname)
                if not regex_name.match(subname):
                    continue

                # add non-accent variant (if required) to processing (only if not same as base name)
                variants = [subname]
                if (AutomataVariants.isNonaccent(config)
                        and subname != subname_without_accent):
                    variants.append(subname_without_accent)

                for variant in variants:
                    if use_lowercase:
                        variant = variant.lower()
                    names.add(variant)
                    if regex_prefixes_only_check.match(variant):
                        # add also a variant with small letter starting prefix => "o'Conor"
                        if not variant[0].islower():
                            names.add(variant[0].lower() + variant[1:])

                        # from "O'Connor" add also surname only without prefix => "Connor"
                        nonprefix = regex_prefixes_only.sub('', variant)
                        names.add(nonprefix.lower()
                                  if use_lowercase else nonprefix.capitalize())

            # search for names with preposition, i.e. "van Eyck"
            preposition_name = regex_prepositions_name.search(name.title())
            if preposition_name:
                match = preposition_name.group()

                # normalize name to start with capital, including name with preposition (for example "van Eyck" => "Van Eyck")
                # Warning: contain space on the beginning to avoid match "Ivan Novák" as "van Novák" => it is needed to get substring from second char
                subname = match[1:].title()
                subname_without_accent = remove_accent(subname)

                # add non-accent variant (if required) to processing (only if not same as base name)
                variants = [subname]
                if (AutomataVariants.isNonaccent(config)
                        and subname != subname_without_accent):
                    variants.append(subname_without_accent)

                for variant in variants:
                    if use_lowercase:
                        variant = variant.lower()
                    names.add(variant)

                    # add also a variant with small letter starting preposition => "van Eyck"
                    if not variant[0].islower():
                        names.add(variant[0].lower() + variant[1:])
        return names
Exemplo n.º 50
0
 def AnnotateText(self, text):
   """Splits text into word / non-word spans and attaches entries to words.

   Returns a list of (span, is_word, annots) tuples covering the whole text
   in order. For non-word spans annots is None. For word spans annots is a
   list of dictionary entries sorted by descending score, or None if no
   entry was found. Candidate phrases start at the word and extend over the
   following words as long as they are separated only by whitespace.

   Side effect: every scored entry gets an "aoa_syn" key assigned.
   """
   # Phase 1: cut the text into alternating (region, is_word) spans using
   # the matches of self.re_latin_word.
   spans = []
   cursor = 0
   for match in self.re_latin_word.finditer(text):
     start, end = match.span()
     if start > cursor:
       region = text[cursor:start]
       spans.append((region, False))
     region = text[start:end]
     spans.append((region, True))
     cursor = end
   if cursor < len(text):
     region = text[cursor:]
     spans.append((region, False))
   # Phase 2: annotate every word span.
   out_spans = []
   sent_head = True
   for start_index in range(0, len(spans)):
     span, span_is_word = spans[start_index]
     if not span_is_word:
       out_spans.append((span, False, None))
       continue
     def CheckSurfaceMatch(surface, title):
       """Checks if surface equals title, tolerating a sentence-initial capital."""
       if surface == title:
         return True
       # At the head of a sentence the first letter may be capitalized only
       # because of its position; the pronoun "I" is excluded.
       if sent_head and surface != "I":
         norm_surface = surface[0].lower() + surface[1:]
         if norm_surface == title:
           return True
       return False
     annots = []
     tokens = []
     # Grow the phrase one word at a time, starting at the current word.
     for index in range(start_index, len(spans)):
       token, token_is_word = spans[index]
       if token_is_word:
         tokens.append(token)
         phrase = " ".join(tokens)
         # Each variant is (lookup key, weight); the weight scales the score
         # of every entry found through that key.
         variants = []
         variants.append((phrase, 1.0))
         for infl_base in self.SearchInflections(phrase.lower()):
           variants.append((infl_base, 0.7))
         # The decompositions below apply to the single starting word only.
         if index == start_index:
           # Contractions matched by self.re_aux_contraction: group 1 is
           # the bare word, group 2 the contracted suffix.
           match = self.re_aux_contraction.search(token)
           if match:
             bare = match.group(1)
             variants.append((bare, 0.7))
             for infl_base in self.SearchInflections(bare.lower()):
               variants.append((infl_base, 0.6))
             suffix = match.group(2).lower()
             if suffix == "s" and bare.lower() in ("it", "he", "she"):
               variants.append(("be", 0.0001))
             elif suffix == "ve":
               variants.append(("would", 0.0001))
             elif suffix == "d":
               variants.append(("would", 0.0001))
               variants.append(("have", 0.0001))
             elif suffix == "ll":
               variants.append(("will", 0.0001))
             elif suffix == "m" or suffix == "re":
               variants.append(("be", 0.0001))
             elif suffix == "em":
               variants.append(("them", 0.0001))
           # Negative contractions: restore the irregular stems
           # ("wo" -> "will", "ca" -> "can", "sha" -> "shall").
           match = self.re_not_contraction.search(token)
           if match:
             bare = match.group(1)
             lower_bare = bare.lower()
             if lower_bare == "wo": bare = "will"
             if lower_bare == "ca": bare = "can"
             if lower_bare == "sha": bare = "shall"
             variants.append((bare, 0.7))
             variants.append(("not", 0.0001))
           match = self.re_multi_possessive.search(token)
           if match:
             bare = match.group(1) + match.group(2)
             for infl_base in self.SearchInflections(bare):
               variants.append((infl_base, 0.7))
           # Hyphenated words: also try each part that contains a run of at
           # least three Latin letters.
           if token.find("-") > 0:
             for part in token.split("-"):
               if not regex.search(r"\p{Latin}{3,}", part): continue
               variants.append((part, 0.0002))
               for infl_base in self.SearchInflections(part.lower()):
                 variants.append((infl_base, 0.0001))
         # Look up every distinct variant; each headword is scored only once.
         uniq_variants = set()
         uniq_words = set()
         for variant, var_score in variants:
           if variant in uniq_variants: continue
           uniq_variants.add(variant)
           # presumably 10 is a result limit - verify against SearchExact
           for entry in self.SearchExact(variant, 10):
             word = entry["word"]
             if word in uniq_words: continue
             uniq_words.add(word)
             # True if the phrase as written equals the headword or one of
             # its listed inflected forms.
             match = False
             if CheckSurfaceMatch(phrase, word):
               match = True
             else:
               for infl_name in self.infl_names:
                 infl_values = entry.get(infl_name)
                 if infl_values:
                   for infl_value in regex.split(r"[,|]", infl_values):
                     if CheckSurfaceMatch(phrase, infl_value):
                       match = True
                       break
                 if match:
                   break
             prob = float(entry.get("probability") or 0)
             # Square root of the probability, clamped to [0.00001, 0.05]
             # and scaled so the maximum is 1.0.
             prob_score = min(0.05, max(prob ** 0.5, 0.00001)) * 20
             aoa = entry.get("aoa") or entry.get("aoa_concept") or entry.get("aoa_base")
             if aoa:
               aoa = float(aoa)
             else:
               # No stored value: derive one from the probability.
               aoa = math.log(prob + 0.00000001) * -1 + 3.5
             aoa = min(max(aoa, 3), 20)
             # Lower aoa gives a higher score (2.2 at aoa 3, 0.5 at aoa 20).
             aoa_score = (25 - min(aoa, 20.0)) / 10.0
             entry["aoa_syn"] = int(aoa)
             tran_score = 1.0 if "translation" in entry else 0.5
             item_score = math.log2(len(entry["item"]) + 1)
             labels = set()
             for item in entry["item"]:
               labels.add(item["label"])
             label_score = len(labels) + 1
             children = entry.get("child")
             child_score = math.log2((len(children) if children else 0) + 4)
             # Multi-word headwords are boosted exponentially per space.
             width_score = (200 if "translation" in entry else 10) ** word.count(" ")
             match_score = 1.0 if match else 0.2
             score = var_score * prob_score * aoa_score * tran_score * item_score * label_score * child_score * match_score * width_score
             annots.append((entry, score))
       elif index == start_index:
         break
       elif not regex.match(r"\s", token):
         # Anything other than whitespace between words ends the phrase.
         break
       # The check runs after appending, so a phrase holds at most four words.
       if len(tokens) > 3:
         break
     annots = sorted(annots, key=lambda x: x[1], reverse=True)
     annots = [x[0] for x in annots]
     out_spans.append((span, True, annots or None))
     # NOTE(review): this line is reached only for word spans (non-word spans
     # `continue` above), so punctuation in a separator span never sets
     # sent_head - confirm that this is intended.
     sent_head = span.find("\n") >= 0 or bool(regex.search(r"[.!?;:]", span))
   return out_spans
Exemplo n.º 51
0
def _hits(value, op, target):
    """Return True when ``value <op> target`` holds; *op* is ``'='``, ``'<'`` or ``'>'``."""
    if op == '>':
        return value > target
    if op == '<':
        return value < target
    return value == target


def _flag(value, op, target):
    """Render one die as text, prefixed with ``!`` when it satisfies the comparison."""
    return ('!' if _hits(value, op, target) else '') + str(value)


def _check_bounds(op, target, sides):
    """Raise ValueError when a modifier's comparison value cannot apply to a d*sides*.

    Explicit replacement for ``assert``, which is stripped under ``python -O``.
    """
    if op == '>':
        valid = 0 < target < sides
    elif op == '<':
        valid = 1 < target <= sides
    else:
        valid = 0 < target <= sides
    if not valid:
        raise ValueError(
            'comparison value %d is out of range for a d%d' % (target, sides))


def _check_terminates(op, target, sides):
    """Raise ValueError when every face 1..*sides* would trigger another roll.

    Without this guard ``d1!``, ``d1!1`` or ``d1R`` would roll forever.
    """
    if op == '>':
        endless = target < 1
    elif op == '<':
        endless = target > sides
    else:
        endless = sides == 1 and target == 1
    if endless:
        raise ValueError('modifier would never stop rolling a d%d' % sides)


def _roll_open_ended(dice, sides, op, target):
    """Roll *dice*, then keep rolling one extra die per triggering result.

    :return: ``(all_rolls, initial_count)``; the initial dice come first.
    """
    latest = roll_group(dice)
    rolls = list(latest)
    initial_count = len(rolls)
    pending = num_equal(latest, op, target)
    if pending != 0:
        _check_terminates(op, target, sides)
    while pending != 0:
        latest = roll_group(str(pending) + 'd' + str(sides))
        rolls.extend(latest)
        pending = num_equal(latest, op, target)
    return rolls, initial_count


def _explode(dice, sides, op, target):
    """Exploding dice: every triggering die adds a further roll to the total."""
    rolls, _ = _roll_open_ended(dice, sides, op, target)
    return sum(rolls), '[%s]' % ','.join(_flag(v, op, target) for v in rolls)


def _penetrate(dice, count, sides, op, target):
    """Penetrating dice: like exploding, but every extra die counts at -1."""
    rolls, initial_count = _roll_open_ended(dice, sides, op, target)
    # An omitted count ("d6!p") used to crash on int(''); fall back to the
    # number of dice the first roll actually produced.
    first_num = int(count) if count else initial_count
    head, tail = rolls[:first_num], rolls[first_num:]
    text = ','.join(_flag(v, op, target) for v in head)
    text += ',' if tail else ''  # Separator only when something penetrated
    text += ','.join(_flag(v, op, target) + '-1' for v in tail)
    return sum(head) + sum(v - 1 for v in tail), '[%s]' % text


def _reroll(dice, sides, letter, op, target):
    """Reroll triggering dice: ``R`` until they stop triggering, ``r`` only once."""
    keep_going = letter == 'R'
    finals = []
    shown = []
    for value in roll_group(dice):
        history = [value]
        if keep_going and _hits(value, op, target):
            _check_terminates(op, target, sides)
        while _hits(value, op, target):
            value = random.randint(1, sides)
            history.append(value)
            if not keep_going:
                break
        finals.append(value)
        # Newest roll first, superseded rolls after it, joined with '~'.
        shown.append('~'.join(str(v) for v in reversed(history)))
    return sum(finals), '[%s]' % ','.join(shown)


def _count_successes(dice, sides, op, target):
    """Success dice: the value is the number of dice passing the comparison."""
    _check_bounds(op, target, sides)
    rolled = roll_group(dice)
    total = sum(1 for die in rolled if _hits(die, op, target))
    return total, '[%s]' % ','.join(_flag(die, op, target) for die in rolled)


def _count_successes_and_fails(dice, sides, win_op, win_at, lose_op, lose_at):
    """Success/fail dice: +1 per success (``!``), -1 per failure (``*``)."""
    _check_bounds(win_op, win_at, sides)
    _check_bounds(lose_op, lose_at, sides)
    total = 0
    shown = []
    for die in roll_group(dice):
        if _hits(die, win_op, win_at):
            total += 1
            shown.append('!' + str(die))
        elif _hits(die, lose_op, lose_at):
            total -= 1
            shown.append('*' + str(die))
        else:
            shown.append(str(die))
    return total, '[%s]' % ','.join(shown)


def _keep_or_drop(dice, letter, amount, dropping):
    """Keep (``K``/``k``) or drop (``X``/``x``) the highest/lowest dice.

    Uppercase selects from the highest results, lowercase from the lowest.
    The explanation lists kept dice left of ``~~`` and discarded dice right.
    """
    ordered = sorted(roll_group(dice), reverse=letter.isupper())
    count = int(amount) if amount != '' else 1
    if not 1 <= count < len(ordered):
        raise ValueError('cannot keep or drop %d of %d dice' % (count, len(ordered)))
    selected, rest = ordered[:count], ordered[count:]
    # When dropping, the selected dice are discarded and the total is the
    # sum of the REMAINING dice (the original summed the dropped ones).
    kept, discarded = (rest, selected) if dropping else (selected, rest)
    text = ','.join(str(v) for v in kept) + ' ~~ '
    text += ','.join(str(v) for v in discarded)
    return sum(kept), '[%s]' % text


def _modify_each(dice, letter, amount_text):
    """Apply add (``a``), subtract (``s``) or multiply (``m``) to every die."""
    amount = int(amount_text)
    rolled = roll_group(dice)
    adjusted = []
    for die in rolled:
        if letter == 'a':
            adjusted.append(die + amount)
        elif letter == 's':
            adjusted.append(die - amount)
        elif letter == 'm':
            adjusted.append(die * amount)
        else:
            raise ValueError('unknown per-die modifier %r' % letter)
    text = ','.join(str(die) + letter + amount_text for die in rolled)
    return sum(adjusted), '[%s]' % text


def _evaluate_group(group, floats):
    """Evaluate one non-operator token of a roll.

    The dice forms are tried in a fixed order and the first match wins.

    :param group: A single dice group or numeric literal, without whitespace
    :param floats: Whether float literals are accepted
    :return: ``(value, explanation_fragment)``
    :raises ValueError: If the token is not recognised or a modifier is out of bounds
    :raises TypeError: If a float literal is given while *floats* is false
    """

    def form(pattern):
        return regex.match(pattern, group, regex.IGNORECASE)

    # Exploding dice: 2d10!, d20!10, d6!<2
    m = form(r'^((\d*)d(\d+))!$')
    if m is not None:
        sides = int(m[3])
        return _explode(m[1], sides, '=', sides)
    m = form(r'^((\d*)d(\d+))!(\d+)$')
    if m is not None:
        sides, target = int(m[3]), int(m[4])
        _check_bounds('=', target, sides)
        return _explode(m[1], sides, '=', target)
    m = form(r'^((\d*)d(\d+))!([<>])(\d+)$')
    if m is not None:
        sides, target = int(m[3]), int(m[5])
        _check_bounds(m[4], target, sides)
        return _explode(m[1], sides, m[4], target)

    # Penetrating dice: same forms with "!p"; extra dice take a -1 penalty
    m = form(r'^((\d*)d(\d+))!p$')
    if m is not None:
        sides = int(m[3])
        return _penetrate(m[1], m[2], sides, '=', sides)
    m = form(r'^((\d*)d(\d+))!p(\d+)$')
    if m is not None:
        sides, target = int(m[3]), int(m[4])
        _check_bounds('=', target, sides)
        return _penetrate(m[1], m[2], sides, '=', target)
    m = form(r'^((\d*)d(\d+))!p([<>])(\d+)$')
    if m is not None:
        sides, target = int(m[3]), int(m[5])
        _check_bounds(m[4], target, sides)
        return _penetrate(m[1], m[2], sides, m[4], target)

    # Rerolls: 1d6R (on a one), 4d12r3 (on a number), 2d8R<3 (on a comparison)
    m = form(r'^((\d*)d(\d+))([Rr])$')
    if m is not None:
        return _reroll(m[1], int(m[3]), m[4], '=', 1)
    m = form(r'^((\d*)d(\d+))([Rr])(\d+)$')
    if m is not None:
        sides, target = int(m[3]), int(m[5])
        _check_bounds('=', target, sides)
        return _reroll(m[1], sides, m[4], '=', target)
    m = form(r'^((\d*)d(\d+))([Rr])([<>])(\d+)$')
    if m is not None:
        sides, target = int(m[3]), int(m[6])
        _check_bounds(m[5], target, sides)
        return _reroll(m[1], sides, m[4], m[5], target)

    # Success counting: 2d10>4, and success/fail pairs such as 5d6>4f<2
    m = form(r'^((?:\d*)d(\d+))([<>])(\d+)$')
    if m is not None:
        return _count_successes(m[1], int(m[2]), m[3], int(m[4]))
    m = form(r'^((?:\d*)d(\d+))(?|((<)(\d+)f(>)(\d+))|((>)(\d+)f(<)(\d+)))$')
    if m is not None:
        return _count_successes_and_fails(
            m[1], int(m[2]), m[4], int(m[5]), m[6], int(m[7]))

    # Keep / drop: 2d10K, 4d6k3, 4d6X, 4d6x2
    m = form(r'^((?:\d*)d\d+)([Kk])(\d*)$')
    if m is not None:
        return _keep_or_drop(m[1], m[2], m[3], dropping=False)
    m = form(r'^((?:\d*)d\d+)([Xx])(\d*)$')
    if m is not None:
        return _keep_or_drop(m[1], m[2], m[3], dropping=True)

    # A modifier attached to each individual die: 3d6a2, 3d6s1, 3d6m2
    m = form(r'^((\d*)d(\d+))([asm])(\d+)$')
    if m is not None:
        return _modify_each(m[1], m[4], m[5])

    # Plain dice
    m = form(r'^((\d*)d(\d+))$')
    if m is not None:
        rolled = roll_group(group)
        return sum(rolled), '[%s]' % ','.join(str(v) for v in rolled)

    # Integer literal
    m = form(r'^(\d+)(?!\.)$')
    if m is not None:
        return int(m[1]), m[1]

    # Float literal. The alternation is grouped so both anchors apply to both
    # branches, and the decimal point is escaped.
    m = form(r'^(?:\.\d+|\d+\.\d+)$')
    if m is not None:
        if not floats:
            raise TypeError('float literals are disabled')
        return float(group), group

    raise ValueError('unrecognised dice group')


def roll_dice(roll, *, functions=True, floats=True):
    """
    Rolls dice in dice notation with advanced syntax used according to tinyurl.com/pydice

    The roll is stripped of whitespace, split into operators/functions and
    dice groups, each dice group is rolled, and the resulting expression is
    evaluated with ``SimpleEval``.

    :param roll: Roll in dice notation
    :param functions: Passed through to ``SimpleEval`` as its ``functions`` argument
    :param floats: Whether float literals are accepted; passed through to
        ``SimpleEval``, and when false the final result is converted with ``int``
    :return: Result of roll, and an explanation string
    :raises DiceGroupException: If a token is not a valid dice group
    :raises DiceOperatorException: If the assembled expression cannot be evaluated
    """
    roll = ''.join(roll.split())
    # flags= must be a keyword: the fourth positional argument of sub() is
    # `count`, so the flag used to be read as "replace at most 2, case-sensitive".
    roll = regex.sub(r'(?<=d)%', '100', roll, flags=regex.IGNORECASE)
    roll = roll.replace('^', '**')
    tokens = zero_width_split(
        r'((?<=[\(\),%^\/+*-])(?=.))|((?<=.)(?=[\(\),%^\/+*-]))', roll
    )  # Split the string on the boundary between operators and other chars

    values = []  # Pieces of the expression handed to the evaluator
    fragments = []  # Pieces of the human-readable explanation

    for group in tokens:
        if group in '()/=<>,%^+*-' or group in DEFAULT_FUNCTIONS:  # Operators pass through unmodified
            values.append(group)
            fragments.append(group)
            continue
        try:
            value, fragment = _evaluate_group(group, floats)
        except Exception as err:
            raise DiceGroupException(
                '"%s" is not a valid dicegroup.' % group) from err
        values.append(value)
        fragments.append(fragment)

    parser = SimpleEval(
        floats=floats, functions=functions
    )  # The parser object parses the dice rolls and functions
    try:
        final_result = parser.eval(''.join(str(x) for x in values))
        if not floats:
            final_result = int(final_result)
    except Exception as err:
        raise DiceOperatorException(
            'Error parsing operators and or functions') from err

    # Create explanation string and remove extraneous spaces
    explanation = ''.join(fragments)
    explanation = zero_width_split(
        r"""((?<=[\/%^+])(?![\/,]))| # Split between /, %, ^, and +
                                    ((?<![\/,])(?=[\/%^+]))| # Same as above
                                    ((?<=[^(])(?=-))(?!-[^[]*])| # Split in front of - that are not in a roll
                                    (?<=-)(?=[^\d()a-z])| # Same for splitting after - and before non-literals
                                    (?<=[\d)\]]-)(?=.)(?![^[]*])| # Split after a - that is not in a roll
                                    (?<=,)(?![^[]*])| # Split after a comma that is not in a roll
                                    (?<=([^,]\*))(?!\*)| # Split after a * that is not in a roll
                                    (?<![,\*])(?=\*) # Split before a * that is not in a roll""",
        explanation)  # Split on ops to properly format the explanation
    explanation = ' '.join(explanation)
    explanation = explanation.strip()
    explanation = regex.sub(r'[ \t]{2,}', ' ', explanation)

    return final_result, explanation
Exemplo n.º 52
0
import os
import json

import regex as re


INPUT_DIR = '/var/judgments/'
OUTPUT_DIR = './data/'
# Soft cap on the total number of characters written. It is only checked
# after a whole input file has been processed, so it can be overshot.
SOFT_LIMIT = 1147483648

# Exact file names only: the dot is escaped and the whole name must match,
# so names like "judgments-1xjson" or "judgments-1.json.bak" are ignored.
JUDGMENT_FILE_NAME = re.compile(r'judgments-\d+\.json')


gathered = 0
files = (file for file in os.listdir(INPUT_DIR)
         if JUDGMENT_FILE_NAME.fullmatch(file))

# Writing below would fail if the output directory were missing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for file in files:
    # Explicit UTF-8 so reading does not depend on the platform default.
    with open(os.path.join(INPUT_DIR, file), encoding='utf-8') as f:
        judgments = json.load(f)

    file_name, file_ext = os.path.splitext(file)
    for judgment in judgments['items']:
        # Remove markup tags and line breaks (together with a hyphen that
        # directly precedes the break).
        content = re.sub(r'<[^>]*>|-?\n', '', judgment['textContent'],
                         flags=re.WORD)

        out_path = os.path.join(OUTPUT_DIR,
                                f'{file_name}.{judgment["id"]}.txt')
        with open(out_path, 'w', encoding='utf-8') as f2:
            f2.write(content)

        gathered += len(content)

    if gathered >= SOFT_LIMIT:
        break
Exemplo n.º 53
0
def _generalized_check(script: str, word: str) -> bool:
    """Tell whether *word* consists only of characters of *script* and apostrophes.

    Katakana and Hiragana are tested through the Unicode ``Block`` property;
    every other script name is tested through the ``Script`` property.
    """
    if script in ("Katakana", "Hiragana"):
        prop = "Block"
    else:
        prop = "Script"
    found = regex.match(rf"^[\p{{{prop}={script}}}']+$", word)
    return found is not None
Exemplo n.º 54
0
    def _inic_dic_vars(símismo):
        """Rebuild the variable dictionary and the variable-type lists.

        Reads the lines in ``símismo.dic_doc['cuerpo']``, creates one entry
        per variable in ``símismo.variables`` and then refills
        ``símismo.niveles``, ``símismo.flujos``, ``símismo.auxiliares`` and
        ``símismo.constantes``.
        """

        # Discard whatever was there from a previous run.
        símismo.variables.clear()

        # Glue the body lines into one text per variable: a text is finished
        # once a piece ends with '|'. Trailing backslashes are removed.
        var_texts = []
        start_new = True
        for raw_line in símismo.dic_doc['cuerpo']:
            piece = raw_line.strip().rstrip('\\')
            if not piece:
                continue
            if start_new:
                var_texts.append(piece)
            else:
                var_texts[-1] += piece
            start_new = piece[-1] == '|'

        for var_text in var_texts:
            # Each text has three '~'-separated fields:
            # equation ~ units [limits] ~ info
            tx_ec, tx_units_lims, tx_info = var_text.strip('|').split('~')
            obj_ec = Ecuación(tx_ec, dialecto='vensim')
            if obj_ec.tipo == 'sub':
                continue
            name = obj_ec.nombre

            # Exactly one '[' separates units from limits; anything else is
            # treated as units only.
            units_and_lims = tx_units_lims.split('[')
            if len(units_and_lims) == 2:
                tx_units, tx_lims = units_and_lims
            else:
                tx_units, tx_lims = tx_units_lims, ''

            if tx_lims:
                # '?' marks an unspecified limit.
                parsed = [
                    None if x.strip() == '?' else float(x)
                    for x in tx_lims.strip(']').split(',')
                ]
                lims = tuple(parsed[:2])
            else:
                lims = (None, None)

            símismo.variables[name] = {
                'val': None,
                'unidades': tx_units.strip(),
                'ec': str(obj_ec),
                'ingreso': None,
                'dims': (1, ),  # To do
                'líms': lims,
                'subscriptos': None,  # To do
                'hijos': [],
                'parientes': obj_ec.variables(),
                'egreso': None,
                'info': tx_info.strip(),
                'val_inic': False
            }

        # Register every variable as a child of each of its parents.
        for child, d_child in símismo.variables.items():
            for parent in d_child['parientes']:
                símismo.variables[parent]['hijos'].append(child)

        # Empty the type lists before refilling them:
        símismo.flujos.clear()
        símismo.auxiliares.clear()
        símismo.constantes.clear()
        símismo.niveles.clear()

        # Levels are the variables whose equation starts with an INTEG call.
        símismo.niveles += [
            x for x, d in símismo.variables.items()
            if regex.match(r'INTEG *\(', d['ec'])
        ]

        # Flows, by definition, are the parents of the levels.
        for level in símismo.niveles:

            # Arguments of the VENSIM INTEG function of this level
            ec = Ecuación(símismo.variables[level]['ec'], dialecto='vensim')
            arg_integ = ec.sacar_args_func('INTEG', i=1)[0]
            arg_init = ec.sacar_args_func('INTEG')[1]
            if arg_init in símismo.variables:
                símismo.variables[arg_init]['val_inic'] = True

            # Variables of the first INTEG argument, minus the internal ones
            candidates = [
                v for v in Ecuación(arg_integ, dialecto='vensim').variables()
                if v not in símismo.internos
            ]

            for candidate in candidates:
                # Add it unless it is already listed as a flow or is a level.
                if candidate in símismo.flujos or candidate in símismo.niveles:
                    continue
                símismo.flujos.append(candidate)

        # Auxiliaries have parents and are neither levels nor flows.
        símismo.auxiliares += [
            x for x, d in símismo.variables.items()
            if len(d['parientes']) and x not in símismo.niveles
            and x not in símismo.flujos
        ]

        # Constants have no parents and no child that is a flow.
        símismo.constantes += [
            x for x, d in símismo.variables.items()
            if not len(d['parientes'])
            and not any(h in símismo.flujos for h in d['hijos'])
        ]
Exemplo n.º 55
0
 # Fragment of a directory-scanning loop body. NOTE(review): the enclosing
 # loop is not visible here; `subdir`, `dirs` and `files` presumably come
 # from an os.walk-style iteration — confirm against the full file.
 dirs.sort(key=nkey)
 # Skip directories that contain no PDF file at all.
 if not any(
     fname.upper().endswith(".PDF") for fname in os.listdir(subdir)
 ):
     continue
 # Label for this directory: "<parent directory name>_<directory name>".
 nrope = (
     os.path.basename(os.path.dirname(subdir))
     + "_"
     + os.path.basename(subdir)
 )
 # Progress line: running counter, a tab, then the label.
 print(str(countope) + "\t" + nrope)
 # print(str(ile) + "_" + str(countope) + "\t" + nrope)
 countope += 1
 for file in natsorted(files):
     # Only PDFs whose upper-cased name contains -SZK-, -M- or -Z- with at
     # least one character on either side.
     if file.upper().endswith(".PDF") and regex.match(
         r"^.+(-SZK-|-M-|-Z-).+\.PDF", file.upper()
     ):
         plik = os.path.join(subdir, file)
         try:
             doc = fitz.open(plik)
             strony = doc.pageCount
             # Any page count other than 1 is appended to the `plikwynik`
             # file as "<pages>\t<path>", and the file is skipped.
             if not strony == 1:
                 with io.open(
                     plikwynik, "a", encoding="utf-8"
                 ) as wynik:
                     wynik.write(str(strony) + "\t" + plik + "\n")
                 continue
         # NOTE(review): the bare except also catches KeyboardInterrupt and
         # SystemExit. Every failure in the try block ends up here and the
         # path is appended to the `bledny` file.
         except:
             with io.open(bledny, "a", encoding="utf-8") as bl:
                 bl.write(plik + "\n")
             continue
Exemplo n.º 56
0
Arquivo: play.py Projeto: 3ach/aoc
import regex
import sys

# One rule per line, for example:
#   "light red bags contain 1 bright white bag, 2 muted yellow bags."
#   "faded blue bags contain no other bags."
# The sentence-ending period and the period after "no other bags" are escaped
# so they only match a literal "." rather than any character.
RULE_REGEX = r'^(?P<container>[a-z ]+) bags contain (((?P<contained>\d+ [a-z ]+) bags?(, |\.))+|(?:no other bags\.))$'

# Parse every rule read from standard input and print its named groups.
for line in sys.stdin:
    rule = line.strip()
    if not rule:
        # Tolerate blank lines (e.g. a trailing newline at end of input).
        continue
    parsed = regex.match(RULE_REGEX, rule)
    if parsed is None:
        # Fail with the offending line instead of an opaque AttributeError
        # on None.
        raise ValueError('unparsable rule: {!r}'.format(rule))
    print(parsed.groupdict())
Exemplo n.º 57
0
def get_raw_dates(text,
                  strict=False,
                  base_date=None,
                  return_source=False,
                  locale=None) -> Generator:
    """
    Yield "raw" (potential) dates found in text, before any false-positive
    classification.

    Candidates come from ``DateFinder.extract_date_strings``; each one passes
    through a chain of heuristic filters and is then parsed with
    ``DateFinder.parse_date_string``. Candidates that are rejected or fail to
    parse are skipped silently.

    :param text: raw text to search
    :param strict: forwarded to ``extract_date_strings`` (only complete or
        strict matches)
    :param base_date: base date to use for implied or partial matches;
        defaults to January 1st, 00:00:00 of the current year
    :param return_source: when true, yield ``(date, index)`` tuples, where
        ``index`` is the second element of the candidate returned by
        ``extract_date_strings``; otherwise yield only the date
    :param locale: locale object forwarded to ``parse_date_string``
    :return: generator of ``date``/``datetime`` values, or of
        ``(date, index)`` tuples when ``return_source`` is true
    """
    # Setup base date: midnight on January 1st of the current year.
    if not base_date:
        base_date = datetime.datetime.now().replace(day=1,
                                                    month=1,
                                                    hour=0,
                                                    minute=0,
                                                    second=0,
                                                    microsecond=0)

    # Find potential dates
    date_finder = DateFinder(base_date=base_date)

    # Register every extra token except 't' as a replacement by a single space.
    for extra_token in date_finder.EXTRA_TOKENS_PATTERN.split('|'):
        if extra_token != 't':
            date_finder.REPLACEMENTS[extra_token] = ' '

    # Iterate through possible matches
    possible_dates = list(date_finder.extract_date_strings(text,
                                                           strict=strict))
    # One boolean per processed candidate (True = accepted), kept in step with
    # possible_dates so that index i - 1 refers to the previous candidate.
    possible_matched = []

    for i, possible_date in enumerate(possible_dates):
        # Unpack the candidate: matched string, index, and properties dict.
        date_string = possible_date[0]
        index = possible_date[1]
        date_props = possible_date[2]

        # Cleanup "day of" strings: when the previous candidate was rejected
        # and carried exactly one digits modifier, move that modifier here
        # and prepend it to date_string with "st"/"nd"/"rd"/"th" removed.
        if "of" in date_props["extra_tokens"] or "OF" in date_props[
                "extra_tokens"]:
            # NOTE(review): this indexes possible_dates[i - 1] before the
            # `i > 0` check below; for i == 0 it reads the LAST candidate
            # (negative index), although the value is then unused.
            num_dig_mod = len(possible_dates[i - 1][2]["digits_modifier"])
            if i > 0 and not possible_matched[i - 1] and num_dig_mod == 1:
                date_props["digits_modifier"].extend(
                    possible_dates[i - 1][2]["digits_modifier"])
                # pop() also removes the modifier from the previous candidate.
                date_string = possible_dates[i - 1][2]["digits_modifier"].pop() \
                                  .replace("st", "") \
                                  .replace("nd", "") \
                                  .replace("rd", "") \
                                  .replace("th", "") + date_string

        # Counts used by the heuristic filters below.
        num_dig_mod = len(date_props["digits_modifier"])
        num_dig = len(date_props["digits"])
        num_days = len(date_props["days"])
        num_month = len(date_props["months"])
        num_slash = date_props["delimiters"].count("/")
        num_point = date_props["delimiters"].count(".")
        num_hyphen = date_props["delimiters"].count("-")

        # Remove double months
        if num_month > 1:
            possible_matched.append(False)
            continue

        # Remove wrong months like Dec*ided or Mar*tin
        if num_month == 1 and date_props['extra_tokens'] \
                and (date_props['months'][0] + date_props['extra_tokens'][-1]) in date_string:
            possible_matched.append(False)
            continue

        # Check strange strings: digit modifiers without any digits.
        if num_dig_mod > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOW only
        if num_days > 0 and num_dig == 0:
            possible_matched.append(False)
            continue

        # Skip DOM only
        if num_month == 0 and num_dig_mod == 0 and num_dig <= 1:
            possible_matched.append(False)
            continue

        # Skip odd date like "1 10"
        if re.match(r'\d{1,2}\s+\d{1,2}', date_string):
            possible_matched.append(False)
            continue

        # Skip floats: dotted strings without a month that do not start
        # with a dd.dd.dd(dd) shape.
        if num_point and not num_month and not re.match(
                r'\d{2}\.\d{2}\.\d{2,4}', date_string):
            possible_matched.append(False)
            continue

        # Skip odd months from string like "Nil 62. Marquee"
        if re.search(r'\d{2,4}\.\s*[A-Za-z]', date_string):
            possible_matched.append(False)
            continue

        # Skip fractions
        if (num_slash == 1 or num_hyphen == 1) and num_dig > 2:
            possible_matched.append(False)
            continue

        # Skip three-digit blocks and double zero years
        found_triple = False
        found_dz = False
        for digit in date_props["digits"]:
            if len(digit) == 3:
                found_triple = True
            if digit.startswith("00"):
                found_dz = True
        if found_triple or found_dz:
            possible_matched.append(False)
            continue

        # Skip "may" alone
        if num_dig == 0 and num_days == 0 and "".join(
                date_props["months"]).lower() == "may":
            possible_matched.append(False)
            continue

        # Skip cases like "13.2 may" or "12.12may"
        if (num_dig > 0 and (num_point + num_slash + num_hyphen) > 0
                and "".join(date_props["months"]).lower() == "may"):
            possible_matched.append(False)
            continue

        # Cleanup: strip extra tokens from the string, longest first, keeping
        # "to" and "t"; then clear the extra-token list.
        for token in sorted(date_props["extra_tokens"], key=len, reverse=True):
            if token.lower() in ["to", "t"]:
                continue
            date_string = date_string.replace(token, "")
        date_string = date_string.strip()
        date_props["extra_tokens"] = []

        # Skip strings too long
        if len(date_string) > DATE_MAX_LENGTH:
            possible_matched.append(False)
            continue

        # Skip numbers only: no month and no delimiter other than comma or
        # whitespace.
        match_delims = set("".join(date_props["delimiters"]))
        bad_delims = {",", " ", "\n", "\t"}
        len_diff_set = len(match_delims - bad_delims)
        if len_diff_set == 0 and num_month == 0:
            possible_matched.append(False)
            continue

        # Parse and skip nones. The full string is tried first (cutter == 0);
        # afterwards `cutter` tokens are dropped from the end (direction 0)
        # or from the start (direction 1) of the original token list until a
        # parse succeeds.
        date = None
        try:
            date_string_tokens = date_string.split()
            for cutter in range(len(date_string_tokens)):
                for direction in (0, 1):
                    if cutter > 0:
                        if direction:
                            _date_string_tokens = date_string_tokens[cutter:]
                        else:
                            _date_string_tokens = date_string_tokens[:-cutter]
                        date_string = ' '.join(_date_string_tokens)
                    try:
                        date = date_finder.parse_date_string(date_string,
                                                             date_props,
                                                             locale=locale)
                    # pylint: disable=broad-except
                    except:
                        date = None
                    if date:
                        break
                else:
                    continue  # executed if the loop ended normally (no break)
                break  # executed if 'continue' was skipped (break)
        except TypeError:
            possible_matched.append(False)
            continue

        # Reject a parsed date that check_date_parts_are_in_date does not
        # confirm against the candidate's properties.
        if date and not check_date_parts_are_in_date(date, date_props):
            date = None

        if not date:
            possible_matched.append(False)
            continue
        # for case when datetime.datetime(2001, 1, 22, 20, 1, tzinfo=tzoffset(None, -104400))
        # (isoformat() raises ValueError for such values, so they are rejected)
        if hasattr(date, 'tzinfo'):
            try:
                _ = date.isoformat()
            except ValueError:
                possible_matched.append(False)
                continue
        possible_matched.append(True)

        # A datetime whose hour and minute are both zero is reduced to a
        # plain date (seconds are not checked).
        if isinstance(
                date,
                datetime.datetime) and date.hour == 0 and date.minute == 0:
            date = date.date()
        # Emit the result, with the candidate's index when requested.
        if return_source:
            yield (date, index)
        else:
            yield date
Exemplo n.º 58
0
# Print the project banner.
print(
    "-----------------------Muhammad Luqman------------------G00353385----------------Graph Theory Project------------------------"
)


def _prompt_nonempty(prompt):
    """Ask the user with `prompt` until a non-empty string is entered; return it."""
    value = input(prompt)
    while value == "":
        print("Cannot be Empty!")
        value = input(prompt)
    return value


# Infix regular expression to match against.
u_infix = _prompt_nonempty("Please enter an infix :")

# String to test against the expression.
string = _prompt_nonempty("please enter a string :")

# Run the matcher once and branch on its truthiness. Comparing the result
# with `== True` / `== False` ran the match twice and printed nothing at all
# for any result that was not exactly a bool (e.g. None or a match object).
if regex.match(u_infix, string):
    print("The inputs Matched!")
else:
    print("The inputs not Matched")
Exemplo n.º 59
0
    def segment_inner(self, e):
        """Split the content of element ``e`` into segments and emit tokens.

        Steps performed:

        1. Collect translations from ``[data-msgstr]`` descendants (keyed by
           their punctuation- and whitespace-stripped, case-folded text) and
           variant notes from ``.var`` descendants, then drop those tags
           while keeping their text.
        2. Serialise ``e``, strip its outer tag, emit any leading empty
           ``<a>`` anchors as a comment token, and mangle the remainder with
           ``self.mangle``.
        3. Split the string on sentence-ending punctuation, move a trailing
           opening quote to the start of the following segment, demangle,
           and split each segment on ``<br>`` tags.
        4. Emit text, comment, comment-note and newline tokens through
           ``self.add_token``; text tokens carry a ``uid:paragraph.sentence``
           context and the matching translation when one was collected.

        Unused translations are reported on standard output at the end.

        Args:
            e: lxml HTML element to segment; it is modified in place by the
                ``drop_tag`` calls.

        Raises:
            ValueError: if the serialised element does not have the form
                ``<tag ...>content</tag>``.
        """

        msgstr_mapping = {}

        def sanitize_key(key):
            # Normalise text so lookups ignore punctuation, whitespace, case.
            return regex.sub(r'[\p{punct}\s]', '', key).casefold()

        msgstr_elements = e.cssselect('[data-msgstr]')
        for msgstr_element in msgstr_elements:
            msgstr_mapping[sanitize_key(msgstr_element.text_content())] = {
                'value': msgstr_element.get('data-msgstr'),
                'used': False
            }
        for msgstr_element in msgstr_elements:
            msgstr_element.drop_tag()

        variant_notes = {}
        var_elements = e.cssselect('.var')
        for var_element in var_elements:
            variant_notes[var_element.text_content()] = 'VAR: {} → {}'.format(
                var_element.text_content(), var_element.get('title'))
        for var_element in var_elements:
            var_element.drop_tag()

        html_string = lxml.html.tostring(e, encoding='unicode').strip()
        # Flatten newlines and non-breaking spaces to spaces; drop soft hyphens.
        html_string = html_string.replace('\n', ' ').replace('\xa0',
                                                             ' ').replace(
                                                                 '\xad', '')
        # Strip the element's own opening and closing tags.
        m = regex.match(r'<[^<]+>[ \n\t]*(.*)</\w+>',
                        html_string,
                        flags=regex.DOTALL)
        if not m:
            raise ValueError(html_string)

        html_string = m[1]
        # Leading empty anchors become a comment token rather than text.
        m = regex.match(r'(?i)((?:<a[^<]*></a>[ \n\t]*)*)(.*)', html_string)
        if m[1]:
            self.add_token(TokenType.comment, m[1])
            html_string = m[2]
        html_string = self.mangle(html_string)
        logger.info(html_string)
        pattern = r'(?<!\d+)([.;:!?—,。:;!…?—](?:\p{punct}+|[ \n\t]*MANG.[0-9]+GLE[\p{punct}\d]*MANG.[0-9]+GLE)*[\u200b\s]*|(?<!^)…[ \n\t]*(?:pe[ \n\t]*…[ \n\t]*)?[.;:!?—;:。,,。:;!…?—]*)(?:MANGR[0-9]+GLE)*'
        parts = regex.split(pattern, html_string)
        # regex.split alternates text and captured delimiter; glue each
        # delimiter back onto the text that precedes it.
        segments = [
            ''.join(parts[i:i + 2]).strip() for i in range(0, len(parts), 2)
        ]

        # Move an opening quote that ends a segment to the start of the next
        # one. Iteration is over a snapshot taken before any modification.
        for i, segment in list(enumerate(segments)):
            m = regex.match(r'(?r)[「「『]$', segment)
            # The final segment has no successor to receive the quote, so it
            # is left untouched instead of raising IndexError.
            if m and i + 1 < len(segments):
                # Diagnostic output kept from the original implementation.
                print(segments[i], segments[i + 1],
                      segment[-1] + segments[i + 1])
                segments[i + 1] = segment[-1] + segments[i + 1]
                segments[i] = segment[:-1]
        sentence_count = 0
        for segment in segments:
            if not segment:
                continue
            segment = self.demangle(segment)
            # Even indices hold text, odd indices the captured <br>/anchors.
            lines = regex.split('(<br[^>]*>|(?:<a [^>]+></a>)*$)', segment)
            for i in range(0, len(lines), 2):
                line = lines[i].strip()
                if line:
                    # A closing tag at the very start of a line belongs to the
                    # previous text token when that token is followed only by
                    # a newline token.
                    m = regex.match(r'^[ \n\t]*(</\w+>)(.*)',
                                    line,
                                    flags=regex.DOTALL)
                    if m:
                        if self.token_stream[
                                -1].type == TokenType.newline and self.token_stream[
                                    -2].type == TokenType.text:
                            self.token_stream[-2].value += m[1]
                            line = m[2].strip()
                    sentence_count += 1
                    ctxt = '{}:{}.{}'.format(self.uid, self.paragraph_count,
                                             sentence_count)
                    msgstr = ''

                    # Emit each variant note once, on the first line that
                    # contains its text.
                    for var_text in list(variant_notes):
                        if var_text in line:
                            self.add_token(TokenType.comment_note,
                                           variant_notes.pop(var_text))
                    if line and not line.isspace():
                        # Parse errors propagate unchanged; the original
                        # copied all locals into module globals before
                        # re-raising, which clobbered module-level names.
                        key = sanitize_key(
                            lxml.html.fromstring(line).text_content())

                        if key in msgstr_mapping:
                            msgstr_mapping[key]['used'] = True
                            msgstr = msgstr_mapping[key]['value']
                        self.add_token(TokenType.text, line, ctxt, msgstr)

                if i + 1 < len(lines):
                    br = lines[i + 1].strip()
                    if br:
                        self.add_token(TokenType.comment, br)
                self.add_token(TokenType.newline)

        # Report translations that never matched a line.
        for key, obj in msgstr_mapping.items():
            if not obj['used']:
                print('Failed to find use for {}: {}'.format(
                    key, obj['value']))
Exemplo n.º 60
0
    def iterate_map_uri(self):
        """
        Expand step templates for each map-reduce item.

        Items must match the map-reduce regex to be included,
        and are stored in the self._map list.
        If no map URIs are given, a single item named "root" with an empty
        chopped URI is stored in self._map.

        Every string value in self._step['template'] is expanded by
        replacing the "${N}" regex-group placeholders of the item, the keys
        of self._replace, and the map URI key (self._step['map']['uri'],
        replaced with the item's chopped URI). Non-string template values
        are copied unchanged.

        Args:
            self: class instance.

        Returns:
            On success: True.
            On failure: the result of self._fatal(msg), after the error has
                been logged.

        """
        def multiple_replace(string, rep_dict):
            """
            Replace multiple string patterns simultaneously.

            Keys are tried longest first, so a key that is a prefix of
            another key never shadows the longer one. Empty keys are
            ignored, since an empty alternative would match at every
            position of the string.

            Args:
                string: The string to be replaced.
                rep_dict: Dictionary containing key and values as patterns that
                    should be replaced.

            Returns:
                The string with all the patterns replaced; the string
                unchanged when there is nothing to replace.

            """
            keys = sorted((k for k in rep_dict if k), key=len, reverse=True)
            if not keys:
                return string
            pattern = re.compile("|".join(re.escape(k) for k in keys), re.M)
            return pattern.sub(lambda x: rep_dict[x.group(0)], string)

        # iterate map items
        if self._map_uris == []:
            # no mapping, run only one job
            self._map = [{
                'filename': 'root',
                'chopped_uri': '',
                'replace': {},
                'template': {},
                'status': 'PENDING',
                'attempt': 0,
                'run': [{}]
            }]

        else:
            # list uri contents and place into matched files
            file_list = self._get_map_uri_list()
            if file_list is False:
                msg = 'cannot get list of items from map uris: {}'.format(
                    self._map_uris)
                Log.an().error(msg)
                return self._fatal(msg)

            if file_list == []:  # this folder should never be empty
                msg = 'map uri contents cannot be empty: {}'.format(
                    self._map_uris)
                Log.an().error(msg)
                return self._fatal(msg)

            for f in file_list:
                # check if file matches regex
                match = re.match(self._step['map']['regex'], f['filename'])
                if match:
                    # capture group N becomes the "${N}" placeholder (1-based)
                    replace = {
                        '${' + str(i + 1) + '}': str(group)
                        for i, group in enumerate(match.groups())
                    }
                    self._map.append({
                        'filename': f['filename'],
                        'chopped_uri': f['chopped_uri'],
                        'replace': replace,
                        'template': {},
                        'status': 'PENDING',
                        'attempt': 0,
                        'run': [{}]
                    })

            if not self._map:
                msg = ('map uri contents must include at least'
                       ' one item matching regex: {}').format(self._map_uris)
                Log.an().error(msg)
                return self._fatal(msg)

        # iterate through items, expand templates
        for map_item in self._map:
            # per-item replacements first, then the step-wide ones on top
            replace = map_item['replace'].copy()
            replace.update(self._replace)
            # replace map uri base with value corresponding to map item
            replace[self._step['map']['uri']] = map_item['chopped_uri']
            for template_key, template_value in self._step['template'].items():
                if isinstance(template_value, str):
                    map_item['template'][template_key] = multiple_replace(
                        template_value, replace)
                else:
                    map_item['template'][template_key] = template_value

        return True