Example #1
def download(seed):
    url = geturl()
    if url is None:
        return None
    print(url)
    locs = getlocs(url[0])
    number = len(locs)
    cname = url[1].split('/')[0].strip()
    c1name = rmv(cname)
    with open('companies.csv', 'a') as f:
        f.write(c1name.strip() + ',' + url[2] + ',' + url[3] + ',' + url[4] +
                ',' + locs[0] + ',')
        nms = []
        for r in locs[1:]:
            flgname = rmv(r[0])

            name = HumanName(flgname)
            if (name.first == '' or name.last == ''):
                continue

            nms.append(flgname)
        nms = ','.join(nms)
        f.write(nms + '\n')
    try:
        loop = asyncio.get_event_loop()
        future = asyncio.ensure_future(run(number - 1, locs[1:], seed))
        rows = loop.run_until_complete(future)
    except Exception:
        loop.stop()
        print("retrying " + url[0])
        return url

    with open('emails.csv', 'a') as f:
        for row in rows:
            if (row == []):
                continue
            if (url[1] in row[1]):
                row[1] = row[1].replace(url[1], '').strip()
                row[1] = row[1][:row[1].rfind(' ')]
            elif (cname in row[1]):
                row[1] = row[1].replace(cname, '').strip()
                row[1] = row[1][:row[1].rfind(' ')]
            row[1] = row[1].split(',')[0]
            row[1] = row[1].split('-')[0]
            row[1] = rmv(row[1])
            row[1] = row[1].replace(" De ", ' ').replace(" And ", ' ').replace(
                " In ", ' ').replace(" For ",
                                     ' ').replace(" Of ",
                                                  ' ').replace(" En ", ' ')
            if ('.html' in row[-1]):
                f.write(
                    rmv(row[0]).strip() + ',' + row[1].strip() + ',' +
                    c1name.strip() + '\n')
                continue
            else:
                f.write(
                    rmv(row[0]).strip() + ',' + row[1].strip() + ',' +
                    c1name.strip())
            dups = []

            for r in row[3:-1]:
                if (test_email(r.strip()) and r.strip() not in dups):
                    f.write(',' + r.strip())
                    dups.append(r.strip())
            if (test_email(row[-1].strip()) and row[-1].strip() not in dups):
                f.write(',' + row[-1].strip() + '\n')
            else:
                f.write('\n')

    update()
    return url
Example #2
def parseNames(fullname):
    return HumanName(fullname)
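As a quick reference for what the wrapper above returns, here is a minimal sketch of the attributes a HumanName object exposes (the sample name is the one used in nameparser's own documentation):

# Minimal sketch of the parts HumanName exposes.
name = parseNames("Dr. Juan Q. Xavier de la Vega III")
print(name.title, name.first, name.middle, name.last, name.suffix)
# Dr. Juan Q. Xavier de la Vega III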
Example #3
 def extract_title(self):
     self.Xy["last_name"] = self.Xy.name.apply(lambda x: HumanName(x).last)
Example #4
 def extract_last_name(self):
     "Extracts last name from name feature using nameparser."
     self.Xy["last_name"] = self.Xy.name.apply(lambda x: HumanName(x).last)
Example #5
 def clean_last_name(self):
     Hname = HumanName(' '.join([self.first_name, self.last_name]))
     Hname.capitalize()
     self.last_name = Hname.last
Example #6
 def _generate_lastName(self):
     if "name" in self.parse_data.keys():
         if self.parse_data["name"]:
             self.lastName = HumanName(self.parse_data["name"]).last
Example #7
def split_name(name):
    n = HumanName(name)
    return pd.Series([n.first, n.last])
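Because split_name returns a pd.Series, it can fan a single name column out into two columns via apply; a minimal sketch, assuming a hypothetical DataFrame with a name column:

import pandas as pd
from nameparser import HumanName

df = pd.DataFrame({"name": ["Ada Lovelace", "Grace Murray Hopper"]})
# apply() aligns the two-element Series returned by split_name with two columns
df[["first", "last"]] = df["name"].apply(split_name)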
Example #8
    def parse_rss(message):
        """
            Parse Feeds into the CMS Module
        """

        db = current.db
        s3db = current.s3db
        table = s3db.msg_rss
        record = db(table.message_id == message.message_id).select(table.channel_id,
                                                                   table.title,
                                                                   table.from_address,
                                                                   table.body,
                                                                   table.date,
                                                                   table.location_id,
                                                                   table.tags,
                                                                   table.author,
                                                                   limitby=(0, 1)
                                                                   ).first()
        if not record:
            return

        post_table = s3db.cms_post

        # Is this an Update or a Create?
        body = record.body or record.title
        url = record.from_address
        if url:
            doc_table = s3db.doc_document
            exists = db(doc_table.url == url).select(doc_table.doc_id,
                                                     limitby=(0, 1)
                                                     ).first()
            if exists:
                exists = db(post_table.doc_id == exists.doc_id).select(post_table.id,
                                                                       limitby=(0, 1)
                                                                       ).first()
        else:
            # Use Body
            exists = db(post_table.body == body).select(post_table.id,
                                                        limitby=(0, 1)
                                                        ).first()


        channel_id = record.channel_id
        tags = record.tags

        author = record.author
        if author:
            ptable = s3db.pr_person
            # https://code.google.com/p/python-nameparser/
            from nameparser import HumanName
            name = HumanName(author)
            first_name = name.first
            middle_name = name.middle
            last_name = name.last
            query = (ptable.first_name == first_name) & \
                    (ptable.middle_name == middle_name) & \
                    (ptable.last_name == last_name)
            pexists = db(query).select(ptable.id,
                                       limitby=(0, 1)
                                       ).first()
            if pexists:
                person_id = pexists.id
            else:
                person_id = ptable.insert(first_name = first_name,
                                          middle_name = middle_name,
                                          last_name = last_name)
                s3db.update_super(ptable, dict(id=person_id))
        else:
            person_id = None

        if exists:
            post_id = exists.id
            db(post_table.id == post_id).update(title = record.title,
                                                body = body,
                                                date = record.date,
                                                location_id = record.location_id,
                                                person_id = person_id,
                                                )
            # Read existing Tags (which came from remote)
            ttable = db.cms_tag
            ltable = db.cms_tag_post
            query = (ltable.post_id == post_id) & \
                    (ltable.mci == 1) & \
                    (ltable.tag_id == ttable.id)
            rows = db(query).select(ttable.name)
            # Compare these to tags in current version of post
            old_tags = [r.name for r in rows]
            new_tags = []
            delete_tags = []
            for tag in tags:
                if tag not in old_tags:
                    new_tags.append(tag)
            for tag in old_tags:
                if tag not in tags:
                    delete_tags.append(tag)
            if new_tags or delete_tags:
                lookup_tags = []
                lookup_tags.extend(new_tags)
                lookup_tags.extend(delete_tags)
                _tags = db(ttable.name.belongs(lookup_tags)).select(ttable.id,
                                                                    ttable.name,
                                                                    ).as_dict(key="name")
            for t in new_tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name = t)
                ltable.insert(post_id = post_id,
                              tag_id = tag_id,
                              mci = 1, # This is an imported record, not added natively
                              )
            for t in delete_tags:
                tag = _tags.get(t, None)
                if tag:
                    query = (ltable.post_id == post_id) & \
                            (ltable.tag_id == tag["id"]) & \
                            (ltable.mci == 1) & \
                            (ltable.deleted == False)
                    db(query).delete()

        else:
            # Default to 'News' series
            table = db.cms_series
            series_id = db(table.name == "News").select(table.id,
                                                        cache=s3db.cache,
                                                        limitby=(0, 1)
                                                        ).first().id

            post_id = post_table.insert(title = record.title,
                                        body = body,
                                        date = record.date,
                                        location_id = record.location_id,
                                        person_id = person_id,
                                        series_id = series_id,
                                        mci = 1, # This is an imported record, not added natively
                                        )
            record = dict(id=post_id)
            s3db.update_super(post_table, record)

            # Source link
            if url:
                doc_table.insert(doc_id = record["doc_id"],
                                 url = url,
                                 )

            # Is this feed associated with an Org/Network?
            def lookup_pe(channel_id):
                ctable = s3db.msg_rss_channel
                channel_url = db(ctable.channel_id == channel_id).select(ctable.url,
                                                                         limitby=(0, 1)
                                                                         ).first().url
                ctable = s3db.pr_contact
                ptable = s3db.pr_pentity
                query = (ctable.contact_method == "RSS") & \
                        (ctable.value == channel_url) & \
                        (ctable.pe_id == ptable.pe_id)
                pe = db(query).select(ptable.pe_id,
                                      ptable.instance_type,
                                      limitby=(0, 1)
                                      ).first()
                if pe:
                    pe_type = pe.instance_type
                    otable = s3db[pe_type]
                    org_id = db(otable.pe_id == pe.pe_id).select(otable.id,
                                                                 limitby=(0, 1),
                                                                 ).first().id
                    return pe_type, org_id
                else:
                    return None, None

            pe_type, org_id = current.cache.ram("pe_channel_%s" % channel_id,
                                                lambda: lookup_pe(channel_id),
                                                time_expire=120
                                                )
            if pe_type == "org_organisation":
                s3db.cms_post_organisation.insert(post_id=post_id,
                                                  organisation_id=org_id,
                                                  )
            elif pe_type == "org_group":
                s3db.cms_post_organisation_group.insert(post_id=post_id,
                                                        group_id=org_id,
                                                        )


            if tags:
                ttable = db.cms_tag
                ltable = db.cms_tag_post
                _tags = db(ttable.name.belongs(tags)).select(ttable.id,
                                                             ttable.name,
                                                             ).as_dict(key="name")
                for t in tags:
                    tag = _tags.get(t, None)
                    if tag:
                        tag_id = tag["id"]
                    else:
                        tag_id = ttable.insert(name = t)
                    ltable.insert(post_id = post_id,
                                  tag_id = tag_id,
                                  mci = 1, # This is an imported record, not added natively
                                  )

        # No Reply
        return
Example #9
    def populateObjectFromHTML(tree):
        proplist = []
        base_url = "https://www.museumfuernaturkunde.berlin"
        # scrapes the contact info section
        arguments = ['Name','Email', 'Telefon', 'Fax', 'Adresse']
        for info in arguments:
            try:
                proplist.append(tree.find('div', class_=("views-field views-field-"+info)).find('span', class_="field-content").get_text().replace("\r\n", ","))
            except AttributeError:
                proplist.append(None)

        #scrapes the photo URI
        proplist.append(base_url + tree.find('div', class_="views-field views-field-img-URL").span.img.get('src'))

        #scrapes the accordion
        accordion = {}
        # get all accordion entries
        for element in tree.find_all('section', class_="ui_segment_accordion"):
            titel = normalizeTitel(re.sub(r"[^\w .()]", "", element.find('h2', class_="ui_segment_accordion__head").get_text()).strip())
            # get all publications and parse them by <br/>'s
            if titel == "Publikationen":
                accordion[titel] = parseInformation(element, "ui_segment_accordion__content", 'list')
            # search in the "Forschung" entry for an "Forschungsprojekte" entry to extract it
            elif titel == "Forschung":
                research = [re.sub(r"[^\w .()-:/]", "",li.text.strip(' -')) for li in element.find('div', class_="ui_segment_accordion__content").find_all(['li', 'p', 'h2'])]
                groupedResearch = [list(group) for k, group in groupby(research, lambda x: re.match(r".{0,5}(Forschungsprojekt|Projekt)e?:?$", x)) if not k]
                if len(groupedResearch) > 1:
                    if 'Forschungsprojekte' in accordion:
                        accordion['Forschungsprojekte'] += groupedResearch.pop(1)
                    else:
                        accordion['Forschungsprojekte'] = groupedResearch.pop(1)
                if len(groupedResearch) > 0:
                    accordion[titel] = [element for sublist in groupedResearch for element in sublist if element]
            # if nothing matches just get the text of the element
            else:
                if titel in accordion:
                    accordion[titel] += [el for el in [re.sub(r"[^\w .()-:/]", "",li.text.strip(' -')) for li in element.find('div', class_="ui_segment_accordion__content").find_all(['li', 'p'])] if el]
                else:
                    accordion[titel] = [el for el in [re.sub(r"[^\w .()-:/]", "",li.text.strip(' -')) for li in element.find('div', class_="ui_segment_accordion__content").find_all(['li', 'p'])] if el]
        proplist.append(accordion)

        # try to find additional information
        try:
            for link in tree.find('div', class_="view-display-id-single_person_sidebar_view").findAll('a', href=True):
                print(link.text)
                titel = normalizeTitel(link.text.strip())
                if titel == 'Lebenslauf':
                    print("CV Gefunden")
                    infoTree = BeautifulSoup(requests.get(base_url+link.get('href')).text, 'lxml')
                    proplist[6]['CV'] = parseInformation(infoTree, "faqfield-answer", 'text')
                elif titel == 'Publikationen':
                    print("Publikation Gefunden")
                    infoTree = BeautifulSoup(requests.get(base_url+link.get('href')).text, 'lxml')
                    if titel in proplist[6]:
                        proplist[6][titel] += parseInformation(infoTree, "faqfield-answer", 'list')
                    else:
                        proplist[6][titel] = parseInformation(infoTree, "faqfield-answer", 'list')
                else:
                    print("Etwas anderes Gefunden")
                    infoTree = BeautifulSoup(requests.get(base_url+link.get('href')).text, 'lxml')
                    if titel in proplist[6]:
                        proplist[6][titel] += parseInformation(infoTree, "faqfield-answer", 'text')
                    else:
                        proplist[6][titel] = parseInformation(infoTree, "faqfield-answer", 'text')
        except AttributeError:
            print('No additional information was found')


        # delete extra headlines
        if 'Forschung' in proplist[6]:
            proplist[6]['Forschung'] = [val for val in proplist[6]['Forschung'] if not match(val)]
        if 'Publikationen' in proplist[6]:
            proplist[6]['Publikationen'] = [val for val in proplist[6]['Publikationen'] if not match(val)]

        # set up the name parser
        constants = Constants()
        constants.titles.add('PD', 'Dipl.', 'des.', 'Professor', 'M.Sc.', 'FH')
        # parse the name and normalize weird writing styles for "Ph.D."
        proplist[0] = HumanName(re.sub(r"Ph\. D\.", "Ph.D.", proplist[0]), constants=constants).as_dict()
        return proplist
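The Constants object used above comes from nameparser.config; a standalone sketch of that customization, with a hypothetical sample name:

from nameparser import HumanName
from nameparser.config import Constants

constants = Constants()
# Register extra academic titles so they land in .title instead of .first
constants.titles.add('PD', 'Dipl.', 'des.', 'Professor', 'M.Sc.', 'FH')
# Hypothetical sample name
parsed = HumanName("PD Dr. Erika Mustermann", constants=constants).as_dict()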
Example #10
def index(name):
    try:
        return_name = HumanName(name)
        return jsonify(return_name.as_dict())
    except Exception as e:
        return jsonify(str(e))
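The jsonify call suggests this is a Flask view; a minimal sketch of the wiring it presumably sits in (the route path is an assumption, not shown in the original):

from flask import Flask

app = Flask(__name__)
# Assumed route (not shown in the original): parse the name from the URL path
app.add_url_rule("/<name>", view_func=index)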
Example #11
def normalize_person_name(person_name):
    new_string = name_reg_ex.sub("", person_name)
    name = HumanName(new_string)
    name.capitalize(force=True)
    return str(name)
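A small sketch of what force=True does here (name_reg_ex is whatever cleanup pattern the surrounding module defines, so it is omitted):

from nameparser import HumanName

# capitalize(force=True) applies nameparser's capitalization rules
# unconditionally (by default, mixed-case input is left untouched).
name = HumanName("JOHN R. SMITH")
name.capitalize(force=True)
print(str(name))  # John R. Smith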
Example #12
a_list = []
for i in range(1,261):
   url = "https://digitalcommons.calpoly.edu/csse_fac/" + str(i)
   try:
      myRequest = requests.get(url, verify=False)
      soup = BeautifulSoup(myRequest.text,"html.parser")

      for eventRow in soup.find_all('div',attrs={'id':'recommended_citation'}):
         for item in eventRow.find_all('p',attrs={'class':'comments'}):
            if item.find('em') is not None:
               for items in item.find('em'):
                   journal = items

      for eventRow in soup.find_all('p',attrs={'class':'author'}):
         for items in eventRow.find_all('strong'):
            name = HumanName(items.text)
            a_list.append((soup.head.title.get_text().split('by')[0].strip(),name.first,name.last,journal))
      print(a_list)

   except requests.exceptions.ConnectionError:
      print('Whoops')

for item in a_list:
   print(item)

try:
   connection = pymysql.connect(
     host="localhost",
     user="******",
     passwd="****",
     database="jmabel466",
Example #13
from nameparser import HumanName
# Open in text mode: HumanName expects str, not bytes
with open("tempName", 'r') as g:
    lines = g.readlines()
authorList = ""
for line in lines:
    authors = HumanName(line.strip())
    authorList += authors.last + ", " + authors.first + " ; "
print(authorList)
Example #14
def normalize_name(name):
    """
    Normalize a name for sorting and indexing.

    This uses two powerful Python libraries, each for a different reason.

    `probablepeople` can discriminate between company and person names; this
    is used to decide whether to parse the name into last, first, middle form
    or to leave it alone.

    However, the actual name parser in `probablepeople` is unnecessarily
    complex, so strings it identifies as human names are parsed by the
    simpler `nameparser` instead.

    """
    sname = name.strip()  # remove leading and trailing spaces

    # Recognizer tends to mistake concatenated initials for Corporation name.
    # Pad potential initials with spaces before running recognizer
    # For any character A-Z followed by "." and another character A-Z, add a space after the first.
    # (?=[A-Z]) means to find A-Z after the match string but not match it.
    nname = re.sub("(?P<thing>[A-Z]\\.)(?=[A-Z])", "\\g<thing> ", sname)

    try:
        # probablepeople doesn't understand utf-8 encoding. Hand it pure unicode.
        _, type = probablepeople.tag(nname)  # discard parser result
    except probablepeople.RepeatedLabelError:  # if it can't understand the name, it's foreign
        type = 'Unknown'

    if type == 'Corporation':
        return sname  # do not parse and reorder company names

    # special case for capitalization: flag as corporation
    if (adjacent_caps.match(sname)):
        return sname

    # treat anything else as a human name
    nameparts = HumanName(nname)
    normalized = ""
    if nameparts.last:
        normalized = nameparts.last

    if nameparts.suffix:
        if not normalized:
            normalized = nameparts.suffix
        else:
            normalized = normalized + ' ' + nameparts.suffix

    if normalized:
        normalized = normalized + ','

    if nameparts.title:
        if not normalized:
            normalized = nameparts.title
        else:
            normalized = normalized + ' ' + nameparts.title

    if nameparts.first:
        if not normalized:
            normalized = nameparts.first
        else:
            normalized = normalized + ' ' + nameparts.first

    if nameparts.middle:
        if not normalized:
            normalized = nameparts.middle
        else:
            normalized = normalized + ' ' + nameparts.middle

    return normalized.strip()
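A usage sketch of the two branches (exact tags depend on the installed probablepeople/nameparser versions, so the outputs are approximate):

# Usage sketch; results depend on probablepeople/nameparser versions.
print(normalize_name("J.R.R. Tolkien"))      # person: roughly "Tolkien, J. R. R."
print(normalize_name("Acme Holdings Inc."))  # corporation: returned unchanged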
Example #15
def match_by_name(name,
                  state=None,
                  office=None,
                  cycle=None,
                  reverse_name_order=False):
    result_array = []
    name1 = HumanName(name)

    name1_standardized = None
    blocking_name = None

    # sometimes we run into a name that's flipped:
    if reverse_name_order:
        print "Running name reversal check!"
        blocking_name = simple_clean(name1.first)
        name1_standardized = simple_clean(name1.first) + " " + unnickname(
            name1.last)

    else:
        name1_standardized = simple_clean(name1.last) + " " + unnickname(
            name1.first)
        blocking_name = simple_clean(name1.last)

    # if we can't find the last name, assume the name is the last name. This might be a bad idea.
    if not blocking_name:
        blocking_name = simple_clean(name)

    possible_matches = block_by_startswith(blocking_name,
                                           starts_with_blocklength, state,
                                           office, cycle)

    for match in possible_matches:

        name2_name = HumanName(match['cand_name'])
        name2 = simple_clean(name2_name.last) + " " + unnickname(
            name2_name.first)
        # calculate a buncha metrics
        text1 = name1_standardized
        text2 = name2
        #print "comparing '%s' to '%s'" % (text1, text2)
        ratio = 1 / 100.0 * fuzz.ratio(text1, text2)
        partial_ratio = 1 / 100.0 * fuzz.partial_ratio(text1, text2)
        token_sort_ratio = 1 / 100.0 * fuzz.token_sort_ratio(text1, text2)
        token_set_ratio = 1 / 100.0 * fuzz.token_set_ratio(text1, text2)

        avg_len = (len(text1) + len(text2)) / 2.0
        min_len = min(len(text1), len(text2))

        l_ratio = 0
        try:
            l_distance = jellyfish.levenshtein_distance(text1, text2)
            l_ratio = 1.0 - ((0.0 + l_distance) / (0.0 + avg_len))
        except UnicodeEncodeError:
            pass

        long_match = longest_match(text1, text2)
        lng_ratio = (0.0 + long_match) / (0.0 + min_len)

        score = 0
        if (ratio > 0.6 or partial_ratio > 0.6 or l_ratio > 0.6
                or lng_ratio > 0.6):
            score = compute_scores([ratio, partial_ratio, l_ratio, lng_ratio])

        if debug:
            log.debug(
                "|fuzzymatchresult|%s|'%s'|'%s'|score=%s|ratio=%s|partial_ratio=%s|token_sort_ratio=%s|token_set_ratio=%s| l_ratio=%s|lng_ratio=%s"
                % (match['cand_id'], match['cand_name'], name, score, ratio,
                   partial_ratio, token_sort_ratio, token_set_ratio, l_ratio,
                   lng_ratio))

        if (score > 0.8):
            name_standardized = standardize_name_from_dict(match)
            result_array.append({
                'name': name_standardized,
                'id': match['cand_id'],
                'score': score,
                'type': [],
                'match': False
            })
            if debug:
                log.debug("Match found: %s" % name_standardized)

    if debug and len(result_array) == 0:
        log.debug("No match for %s, which was standardized to: %s" %
                  (name, name1_standardized))

    # If it's a good match and there's only one, call it a definite match.
    if (len(result_array) == 1):
        if result_array[0]['score'] > 0.9:
            result_array[0]['match'] = True
    # surprisingly, google refine *doesn't* sort by score.
    return result_array
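The four similarity metrics above come from fuzzywuzzy; a standalone sketch of how they differ on reordered name tokens:

from fuzzywuzzy import fuzz

a, b = "smith john", "john smith"
print(fuzz.ratio(a, b))             # plain edit-distance ratio, order-sensitive
print(fuzz.partial_ratio(a, b))     # best-matching substring ratio
print(fuzz.token_sort_ratio(a, b))  # 100: tokens are sorted before comparing
print(fuzz.token_set_ratio(a, b))   # 100: set-based, order-insensitive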
Example #16
print "Now let's take care of those", train.Age.isnull().sum(), "null values"

# In[30]:

print "One idea would be to take the median age:", train.Age.median(), "or mean:", train.Age.mean(), "but I think we can get a clue from people's titles (ex Mr., Mrs.)"

# First let's see what titles we have.

# In[31]:

titles = []
for name in train.Name:
    titles.append(HumanName(name).title)
print set(titles)

# The titles look good, except there's an empty string; perhaps that's for the less common titles, but I feel pretty good about this range since it covers the basics.
# Now let's make a new feature for these titles.

# In[32]:

train["Title"] = train.Name.map(lambda x: HumanName(x).title)

# In[33]:

print train[train.Title == ''].Name
print
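A plausible next cell (not part of the original notebook) would fill the missing ages with each title group's median:

# Hypothetical continuation: fill missing ages with the per-title median age.
train["Age"] = train.groupby("Title")["Age"].transform(
    lambda ages: ages.fillna(ages.median()))
print(train.Age.isnull().sum())  # should now be 0, or close to it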
Example #17
#Variable for yesterday's date or completion date. I modified it to enter the
#date myself, as the date of completion wasn't always the prior day.
yesterday_date = easygui.enterbox("Enter date of completion")

#Loop for the email that will be made. Body of text can be altered here. chrg_tp determines if the work was warranty related or not.
for chrg_tp, w, tp, ml, nm, bldg, c, c2, alt_w, ml2 in zip(
        charge_type, wo, type, email, name, building, cc, cc2, alt_wo, email2):
    if chrg_tp == "EXTERNAL CHARGE":
        Emailer(
            """<p>Good Morning {0} and {1}!
                </p><p>Our service team completed a work order at {2} (WO# {4}) on {3}.
                </p><p>Any necessary closing documentation will follow soon from our A.R. department.
                </p><p>Please let us know if you have any questions or need any additional information regarding the completed work.
                </p><p>All of us at Company thank you for your continued business!</p>"""
            .format(
                HumanName(nm).first,
                HumanName(ml2).first, bldg, yesterday_date,
                str(alt_w)).replace(" and !", "!", 1).replace(
                    "T&M Roof Leak", "leak repairs", 1).replace(
                        "T&M Waterproofing Leak", "leak repairs", 1).replace(
                            "new penetration installation",
                            "a new penetration installation", 1).replace(
                                "RECALL 2nd Trip", "",
                                1).replace("RECALL 3rd+ Trip", "", 1).replace(
                                    "Roof Repair Work",
                                    "roof repairs", 1).replace(
                                        "(from KPC report)", "",
                                        1).replace("T&M", "a", 1).replace(
                                            "combined", "",
                                            1).replace("Sealant Work",
                                                       "sealant work",
Example #18
def get_lname(somename):
    name = HumanName(somename)
    return name.last
Example #19
    def namer(field):
        #pre
        if type(field) == tuple:
            w_name = re.sub(
                '[\t\r\n]', '',
                ", ".join([x.encode('ascii', 'ignore')
                           for x in field])).upper()
        else:
            w_name = re.sub('[\t\r\n]', '', field.encode('ascii',
                                                         'ignore')).upper()
        if 'ANONYMOUS' not in w_name:
            if ' FORMER ' not in w_name:
                w_name = re.split(";", w_name)[0]
            else:
                w_name = re.split(";", w_name)[1]

            w_name = re.sub("(?<=[`'/+]) | (?=['`/+])", '', w_name)  #6A, 4A-C

            out = HumanName(w_name)
            out.middle = re.sub("^[A-Z] |^[A-Z]\. ", '', out.middle)
            if " " in out.last:
                out.last = re.sub("^[A-Z] |^[A-Z]\. ", '', out.last)
            if re.sub("^[A-Z]\.|^[A-Z]", '',
                      out.first) == '' and len(out.middle) != 0:
                out.first, out.middle = out.middle, ""
            else:
                out.first = re.sub("^[A-Z] |^[A-Z]\. ", '', out.first)

            #post

            if out.middle.startswith("FOR ") or out.middle.startswith(
                    "- "):  #7A, 1B, 3E
                out.middle = ""

            if " FOR " in out.last:
                out.last = re.sub(" FOR .*", '', out.last)

            if len(out.last) == 0 and len(out.title) != 0:  #9A
                if " " in out.first:
                    out = HumanName(out.first)
                else:
                    out.first, out.last = "", out.first

            if " AND " in out.middle or " & " in out.middle:
                out.last = re.split("( AND )|( & )", out.middle)[0]
                out.middle = ""
            if "AND" in out.last or "&" in out.last:

                if out.last.startswith("AND ") or out.last.startswith(
                        "& "):  #3F
                    out.last = HumanName(out.last).last
                elif " AND " in out.last or " & " in out.last:
                    out.last = re.sub("( AND ).*|( & ).*", '', out.last)
            out.first = re.split("( AND )|&|/|\+", out.first)[0]
            out.last = re.split("/", out.last)[0].strip()
            if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
                out.first = out.last.split(" ")[0]
                out.last = out.last.split(" ")[1]
            out.capitalize()
            first, last = out.first, out.last
            if len(out.middle) > 0:
                if re.sub("^[A-Z]\.|^[A-Z]", '', out.middle) == '':
                    out.middle = ""
                elif first.endswith("-") or out.middle.startswith("-"):
                    first += out.middle
                else:
                    first += " %s" % out.middle  #8A-B
            if len(out.suffix) > 0:
                last += " %s" % out.suffix  #2A
            return (first, last)
        else:
            name = HumanName(w_name)
            return (name.first, name.last)
Example #20
def middlename_filter(string):
    if string:
        return HumanName(string).middle
    else:
        return 'N/A'
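The _filter suffix suggests a template filter; a minimal registration sketch, assuming Jinja2 (the environment setup is hypothetical):

from jinja2 import Environment

env = Environment()
env.filters["middlename"] = middlename_filter
env.filters["lastname"] = lastname_filter  # defined in Example #22 below
print(env.from_string("{{ 'John Paul Jones' | middlename }}").render())  # Paul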
Example #21
def printLabel(user, eventName):
    FONT_NAME = "/usr/share/fonts/truetype/DejaVuSans.ttf"
    f17 = ImageFont.truetype(FONT_NAME, 17)
    f18 = ImageFont.truetype(FONT_NAME, 18)
    f25 = ImageFont.truetype(FONT_NAME, 25)
    f40 = ImageFont.truetype(FONT_NAME, 40)
    f80 = ImageFont.truetype(FONT_NAME, 80)
    f100 = ImageFont.truetype(FONT_NAME, 100)

    name = HumanName(user['Name'])

    ticket = user['Tickets'].split()
    if 'Abend' in user['Tickets']:
        ticket[0] += ' ' + ticket[1]

    mitarbeiter = False
    if 'Mitarbeiter' in user['Tickets']:
        mitarbeiter = True
    if 'Free' in user['Tickets']:
        mitarbeiter = True

    for i in range(2):
        img = Image.new("RGB", imgSize, fillWhite)
        draw = ImageDraw.Draw(img)

        printCenter(draw, 50, (name.first.capitalize() + ' ' + name.middle.capitalize()).strip(), f100)
        printCenter(draw, 170, name.last.capitalize(), f40)

        if i == 0:
            if mitarbeiter:
                printCenter(draw, 240, "Mitarbeiter", f80)

            printCenter(draw, 350, ticket[0] + ' (' + user['order'] + ')', f40)
            printCenter(draw, 400, eventName, f40)

            if 'Alter' in user:
                printCenter(draw, 450, "Alter: " + str(user['Alter']), f25)

            if 'Seminare und Workshops' in user:
                seminar = user['Seminare und Workshops'].split('(')
                printCenter(draw, 485, seminar[0], f25)
        else:
            text = """
Samstag
11.30 Uhr - “Zwischen Heimweh und Fernsucht”
13.00 Uhr - Mittagessen
14.30 Uhr - Seminare & Workshops
16.30 Uhr - “We will block you”
18.00 Uhr - Abendessen
20.00 Uhr - “Comming Home”
22.00 Uhr - Latenightangebote & Konzerte"""
            printLeft(draw, 0, 240, text, f17)

            text = """
Sonntag
08.00 Uhr - Frühstück
09.30 Uhr - “Dieser Weg wird kein Leichter sein”
12.00 Uhr - Mittagessen
13.30 Uhr - “Ist herzlich Willkommen übertrieben?”
14.30 Uhr - Abreise

Einlass jeweils 15 Minuten vor Veranstaltungsbeginn"""
            printLeft(draw, 450, 240, text, f17)

            text = """
Solltest du Erste Hilfe benötigen, erreichst du das Connect-Notfall-Team unter 
der Telefonnummer 0170 - 27 65 185 oder du meldest dich am Infopoint."""

            printLeft(draw, 0, 450, text, f18)

        img.save('tmp.png')

        qlr = BrotherQLRaster(CONFIG['printer'])
        qlr.exception_on_warning = True
        convert(qlr, ['tmp.png'], '54', cut=True, dither=False, compress=True, red=False, dpi_600=False, hq=True, rotate=90)
        send(instructions=qlr.data, printer_identifier=CONFIG['identifier'], backend_identifier=CONFIG['backend'], blocking=True)
Example #22
def lastname_filter(string):
    if string:
        return HumanName(string).last
    else:
        return 'N/A'
Example #23
 def _f_clean_func(self, string):
     human = HumanName(string)
     return human.first, human.middle, human.last
Example #24
    def parse_rss_2_cap(message):
        """
            Parse RSS Feeds into the CAP Module
        """

        db = current.db
        s3db = current.s3db
        table = s3db.msg_rss
        message_id = message.message_id
        record = db(table.message_id == message_id).select(
            table.id,
            table.channel_id,
            table.title,
            table.from_address,
            table.body,
            table.date,
            table.location_id,
            table.author,
            limitby=(0, 1)).first()
        if not record:
            return

        pstable = s3db.msg_parsing_status
        # Not adding (pstable.channel_id == record.channel_id) to the query
        # because two channels (http://host.domain/eden/cap/public.rss and
        # http://host.domain/eden/cap/alert.rss) may contain a common URL,
        # e.g. http://host.domain/eden/cap/public/xx.cap
        pquery = (pstable.message_id == message_id)
        prows = db(pquery).select(pstable.id, pstable.is_parsed)
        for prow in prows:
            if prow.is_parsed:
                return

        alert_table = s3db.cap_alert
        info_table = s3db.cap_info

        # Is this an Update or a Create?
        # @ToDo: Use guid?
        # Use Body
        body = record.body or record.title
        query = (info_table.description == body)
        exists = db(query).select(info_table.id, limitby=(0, 1)).first()

        author = record.author
        if author:
            ptable = s3db.pr_person
            # https://code.google.com/p/python-nameparser/
            from nameparser import HumanName
            name = HumanName(author)
            first_name = name.first
            middle_name = name.middle
            last_name = name.last
            query = (ptable.first_name == first_name) & \
                    (ptable.middle_name == middle_name) & \
                    (ptable.last_name == last_name)
            pexists = db(query).select(ptable.id, limitby=(0, 1)).first()
            if pexists:
                person_id = pexists.id
            else:
                person_id = ptable.insert(first_name=first_name,
                                          middle_name=middle_name,
                                          last_name=last_name)
                s3db.update_super(ptable, dict(id=person_id))
        else:
            person_id = None

        if exists:
            # @ToDo: Use XSLT
            info_id = exists.id
            db(info_table.id == info_id).update(
                headline=record.title,
                description=body,
                created_on=record.date,
                #location_id = record.location_id,
                #person_id = person_id,
            )

        else:
            # Embedded link
            url = record.from_address
            import_xml = s3db.resource("cap_alert").import_xml
            stylesheet = os.path.join(current.request.folder, "static",
                                      "formats", "cap", "import.xsl")
            try:
                file = fetch(url)
            except urllib2.HTTPError, e:
                import base64
                rss_table = s3db.msg_rss_channel
                query = (rss_table.channel_id == record.channel_id)
                channel = db(query).select(rss_table.date,
                                           rss_table.etag,
                                           rss_table.url,
                                           rss_table.username,
                                           rss_table.password,
                                           limitby=(0, 1)).first()
                username = channel.username
                password = channel.password
                if e.code == 401 and username and password:
                    request = urllib2.Request(url)
                    base64string = base64.encodestring("%s:%s" %
                                                       (username, password))
                    request.add_header("Authorization",
                                       "Basic %s" % base64string)
                else:
                    request = None

                try:
                    file = urllib2.urlopen(
                        request).read() if request else fetch(url)
                except urllib2.HTTPError, e:
                    # Check if there are links to look into
                    from urlparse import urlparse
                    ltable = s3db.msg_rss_link
                    query_ = (ltable.rss_id == record.id) & \
                             (ltable.deleted != True)
                    rows_ = db(query_).select(ltable.type, ltable.url)
                    url_format = "{uri.scheme}://{uri.netloc}/".format
                    url_domain = url_format(uri=urlparse(url))
                    for row_ in rows_:
                        url = row_.url
                        if url and row_.type == "application/cap+xml" and \
                           url_domain == url_format(uri=urlparse(url)):
                            # Same domain, so okay to use the same username/password combination
                            if e.code == 401 and username and password:
                                request = urllib2.Request(url)
                                request.add_header("Authorization",
                                                   "Basic %s" % base64string)
                            else:
                                request = None
                            try:
                                file = urllib2.urlopen(
                                    request).read() if request else fetch(url)
                            except urllib2.HTTPError, e:
                                current.log.error(
                                    "Getting content from link failed: %s" % e)
                            else:
                                # Import via XSLT
                                import_xml(StringIO(file),
                                           stylesheet=stylesheet,
                                           ignore_errors=True)
Example #25
def parse_name(name):
    h = HumanName(name)
    h.capitalize()
    return {'firstname' : "%s %s" % (h.first, h.middle),
            'lastname' : "%s %s" % (h.last, h.suffix)}
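One subtlety here: when middle or suffix is empty, the "%s %s" join leaves a trailing space; a small usage sketch showing the behavior:

# Usage sketch: an empty middle/suffix leaves a trailing space worth stripping.
parsed = parse_name("jane doe")
print(repr(parsed['firstname']))  # 'Jane '
print(repr(parsed['lastname']))   # 'Doe '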
Example #26
    def parse_rss_2_cms(message):
        """
            Parse Feeds into the CMS Module
        """

        db = current.db
        s3db = current.s3db
        table = s3db.msg_rss
        record = db(table.message_id == message.message_id).select(
            table.channel_id,
            table.title,
            table.from_address,
            table.body,
            table.date,
            table.location_id,
            table.tags,
            table.author,
            limitby=(0, 1)).first()
        if not record or not record.body:
            return

        post_table = s3db.cms_post

        # Is this an Update or a Create?
        body = record.body or record.title
        url = record.from_address
        if url:
            doc_table = s3db.doc_document
            exists = db(doc_table.url == url).select(doc_table.doc_id,
                                                     limitby=(0, 1)).first()
            if exists:
                exists = db(post_table.doc_id == exists.doc_id).select(
                    post_table.id, limitby=(0, 1)).first()
        else:
            # Use Body
            exists = db(post_table.body == body).select(post_table.id,
                                                        limitby=(0, 1)).first()

        channel_id = record.channel_id
        tags = record.tags

        author = record.author
        if author:
            ptable = s3db.pr_person
            # https://code.google.com/p/python-nameparser/
            from nameparser import HumanName
            name = HumanName(author)
            first_name = name.first
            middle_name = name.middle
            last_name = name.last
            query = (ptable.first_name == first_name) & \
                    (ptable.middle_name == middle_name) & \
                    (ptable.last_name == last_name)
            pexists = db(query).select(ptable.id, limitby=(0, 1)).first()
            if pexists:
                person_id = pexists.id
            else:
                person_id = ptable.insert(first_name=first_name,
                                          middle_name=middle_name,
                                          last_name=last_name)
                s3db.update_super(ptable, dict(id=person_id))
        else:
            person_id = None

        if exists:
            post_id = exists.id
            db(post_table.id == post_id).update(
                title=record.title,
                body=body,
                created_on=record.date,
                location_id=record.location_id,
                person_id=person_id,
            )
            # Read existing Tags (which came from remote)
            ttable = db.cms_tag
            ltable = db.cms_tag_post
            query = (ltable.post_id == post_id) & \
                    (ltable.mci == 1) & \
                    (ltable.tag_id == ttable.id)
            rows = db(query).select(ttable.name)
            # Compare these to tags in current version of post
            old_tags = [r.name for r in rows]
            new_tags = []
            delete_tags = []
            for tag in tags:
                if tag not in old_tags:
                    new_tags.append(tag)
            for tag in old_tags:
                if tag not in tags:
                    delete_tags.append(tag)
            if new_tags or delete_tags:
                lookup_tags = []
                lookup_tags.extend(new_tags)
                lookup_tags.extend(delete_tags)
                _tags = db(ttable.name.belongs(lookup_tags)).select(
                    ttable.id,
                    ttable.name,
                ).as_dict(key="name")
            for t in new_tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name=t)
                ltable.insert(
                    post_id=post_id,
                    tag_id=tag_id,
                    mci=1,  # This is an imported record, not added natively
                )
            for t in delete_tags:
                tag = _tags.get(t, None)
                if tag:
                    query = (ltable.post_id == post_id) & \
                            (ltable.tag_id == tag["id"]) & \
                            (ltable.mci == 1) & \
                            (ltable.deleted == False)
                    db(query).delete()

        else:
            # Default to 'News' series
            table = db.cms_series
            series = db(table.name == "News").select(table.id,
                                                     cache=s3db.cache,
                                                     limitby=(0, 1)).first()
            try:
                series_id = series.id
            except AttributeError:
                raise KeyError("News Series not present in CMS module")

            post_id = post_table.insert(
                title=record.title,
                body=body,
                created_on=record.date,
                location_id=record.location_id,
                person_id=person_id,
                series_id=series_id,
                mci=1,  # This is an imported record, not added natively
            )
            record = dict(id=post_id)
            s3db.update_super(post_table, record)

            # Source link
            if url:
                doc_table.insert(
                    doc_id=record["doc_id"],
                    url=url,
                )

            if tags:
                ttable = db.cms_tag
                ltable = db.cms_tag_post
                _tags = db(ttable.name.belongs(tags)).select(
                    ttable.id,
                    ttable.name,
                ).as_dict(key="name")
                for t in tags:
                    tag = _tags.get(t, None)
                    if tag:
                        tag_id = tag["id"]
                    else:
                        tag_id = ttable.insert(name=t)
                    ltable.insert(
                        post_id=post_id,
                        tag_id=tag_id,
                        mci=1,  # This is an imported record, not added natively
                    )

        # No Reply
        return
Example #27
 def extract_title(self):
     """Extracts title from name feature using nameparser."""
     self.Xy["title"] = (
         self.Xy.name.apply(lambda x: HumanName(x).title).replace(
             self.title_translator).replace({"\.": ""}, regex=True))
Example #28
    except IOError as e:
        print "Error writing to output file: I/O error({0}): {1}".format(
            e.errno, e.strerror)
        sys.exit(1)

#csv writer helper. modify header if you need a custom csv format
csvwriter = csv.writer(f2)
csvwriter.writerow(['Name', 'Email Address'])

#do work, son
soup = BeautifulSoup(f1)

#modify find queries as needed
for contact in soup.find_all("div", {"class": "tcell"}):
    try:
        nameNode = contact.div
        if not nameNode:
            continue
        name = HumanName(nameNode.span.b.text.strip())
        emailNode = contact.find("div", {"class": "c"})
        if (emailNode):
            csvwriter.writerow([
                unicode(name).encode('utf-8'),
                emailNode.text.encode('utf-8')
            ])
    except:
        print "error with: "
        print contact
f1.close()
f2.close()
Example #29
 def _generate_lastName(self):
     tmp = extract(RULES["name"], self.sec)
     self.lastName = HumanName(tmp).last
Example #30
def GetNameLink(name):
    # Finds and returns formatted name and wikilinks for given name.
    if name == "":
        return ["", ""]
    # old_name = name
    name = name.replace(". ", ".").replace(".", ". ")
    split_name = name.split(" ")
    mixed_case = [
        f for f in split_name
        if (not f.islower() and not f.isupper()) or f.islower()
    ]
    surname_index = 0
    if mixed_case != []:
        surname_index = split_name.index(mixed_case[-1]) + 1
    first_names = " ".join(split_name[:surname_index])
    surname = HumanName(" ".join(split_name[surname_index:]))
    surname.capitalize(force=True)
    name = (first_names + " " + str(surname)).strip()

    global name_links
    global corrections
    global new_names

    key = LowerName(name)  # key for name in names dict
    if key in name_links:
        links = name_links[key]
    else:
        page_text = GetSoup(
            "https://en.wikipedia.org/wiki/" + name.replace(" ", "_"),
            False).text
        page_text = "" if page_text == None else page_text
        title = name  # player's article's title
        player_page = [
            "International Tennis Federation", "Prize money", "Grand Slam",
            "tennis career", "Wikipedia does not have", "WTA", "ITF", "ATP"
        ]
        disamb_page = ["may refer to"]
        disamb = " (tennis)"
        is_disamb = False
        pipe = False  # pipe [[title|name]] instead of [[title]].
        if "Redirected from" in page_text:  # redirected
            soup = GetSoup(page_text, True)
            title = str(soup.title.string).replace(" - Wikipedia", "").replace(
                " – Wikipedia", "").strip()
            if "tennis" in title or any([
                    f in page_text for f in disamb_page
            ]):  # redirected to disambiguated page, or disamb page
                is_disamb = True
                pipe = True
                title = re.sub(r" \(.*\)", "", title)
                name = title
            # pipe = True # display English spelling/maiden name (e.g. "Margaret Court" instead of "Margaret Smith" before she married).

        if (
                not any([f in page_text for f in player_page])
                or any([f in page_text
                        for f in disamb_page]) and page_text != ""
        ):  # article exists for name but for different person, or disamb page
            is_disamb = True
            pipe = True

        wikilink = "[[" + title + (disamb if is_disamb else
                                   "") + ("|" + name if pipe else "") + "]]"
        split_name = title.split(" ")
        abbr_name = "-".join(
            f[0] for f in split_name[0].split("-")
        ) + " " + " ".join(
            split_name[1:]
        )  # reduce name to first name initials + last name, e.g. "J.-L. Struff"
        abbr_wikilink = "[[" + title + (disamb if is_disamb else
                                        "") + "|" + abbr_name + "]]"
        name_links[key] = [wikilink, abbr_wikilink]
        links = name_links[key]

        # # add entry to new names list
        # exists = "Diese Seite existiert nicht" not in page_text
        # disamb = disamb if is_disamb else ""
        # link = f'<a href="https://en.wikipedia.org/wiki/{title}{disamb}" style="color:{"blue" if exists else "red"};">{title}{disamb}</a>'
        # new_names += f"\t<li>{old_name} → [[{abbr_wikilink.replace(title + disamb, link)}]]</li>"

    if "|" in links[0]:
        corrections_key = links[0][:links[0].index("|")]
    else:
        corrections_key = links[0]
    corrections_key = LowerName(corrections_key).strip("[]")
    if corrections_key in corrections[0]:  # name has correction
        links = corrections[0][corrections_key]
    abbr = links[1][links[1].index("|") + 1:] if "|" in links[1] else links[1]
    abbr = abbr.strip("[]")
    if abbr in corrections[1]:
        links[1] = links[1][:links[1].index("|") +
                            1] + corrections[1][abbr] + "]]"
    return links