def download(seed):
    url = geturl()
    if url is None:
        return None
    print(url)
    locs = getlocs(url[0])
    number = len(locs)
    cname = url[1].split('/')[0].strip()
    c1name = rmv(cname)
    with open('companies.csv', 'a') as f:
        f.write(c1name.strip() + ',' + url[2] + ',' + url[3] + ',' + url[4] + ',' + locs[0] + ',')
        nms = []
        for r in locs[1:]:
            flgname = rmv(r[0])
            name = HumanName(flgname)
            # skip entries that don't parse into at least a first and last name
            if name.first == '' or name.last == '':
                continue
            nms.append(flgname)
        f.write(','.join(nms) + '\n')
    try:
        loop = asyncio.get_event_loop()
        future = asyncio.ensure_future(run(number - 1, locs[1:], seed))
        rows = loop.run_until_complete(future)
    except Exception:
        loop.stop()
        print("retrying " + url[0])
        return url
    with open('emails.csv', 'a') as f:
        for row in rows:
            if not row:
                continue
            # strip the company name (full or short form) from the contact name
            if url[1] in row[1]:
                row[1] = row[1].replace(url[1], '').strip()
                row[1] = row[1][:row[1].rfind(' ')]
            elif cname in row[1]:
                row[1] = row[1].replace(cname, '').strip()
                row[1] = row[1][:row[1].rfind(' ')]
            row[1] = row[1].split(',')[0]
            row[1] = row[1].split('-')[0]
            row[1] = rmv(row[1])
            row[1] = (row[1].replace(" De ", ' ').replace(" And ", ' ')
                            .replace(" In ", ' ').replace(" For ", ' ')
                            .replace(" Of ", ' ').replace(" En ", ' '))
            if '.html' in row[-1]:
                f.write(rmv(row[0]).strip() + ',' + row[1].strip() + ',' + c1name.strip() + '\n')
                continue
            f.write(rmv(row[0]).strip() + ',' + row[1].strip() + ',' + c1name.strip())
            dups = []
            for r in row[3:-1]:
                if test_email(r.strip()) and r.strip() not in dups:
                    f.write(',' + r.strip())
                    dups.append(r.strip())
            if test_email(row[-1].strip()) and row[-1].strip() not in dups:
                # leading comma keeps the last email in its own CSV field
                f.write(',' + row[-1].strip() + '\n')
            else:
                f.write('\n')
    update()
    return url
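# Hedged sketch: `rmv` and `test_email` are helpers that `download` assumes
# but that are not defined in this snippet. Plausible minimal implementations,
# for illustration only (the real helpers may differ):
import re

def rmv(text):
    # strip characters that would break the CSV output and title-case the rest
    return re.sub(r"[^A-Za-z0-9 ]", " ", text).title().strip()

def test_email(candidate):
    # loose well-formedness check, not full RFC 5322 validation
    return re.match(r"^[^@\s]+@[^@\s]+\.[^@\s]+$", candidate) is not None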
def parseNames(fullname):
    return HumanName(fullname)
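# For orientation: HumanName splits a free-form name string into title, first,
# middle, last, and suffix attributes. A minimal sketch using parseNames above
# (the sample name is illustrative):
from nameparser import HumanName

name = parseNames("Dr. Juan Q. Xavier de la Vega III")
print(name.title, name.first, name.middle, name.last, name.suffix)
# roughly: Dr. Juan Q. Xavier de la Vega III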
def extract_title(self):
    self.Xy["title"] = self.Xy.name.apply(lambda x: HumanName(x).title)
def extract_last_name(self):
    """Extracts last name from name feature using nameparser."""
    self.Xy["last_name"] = self.Xy.name.apply(lambda x: HumanName(x).last)
def clean_last_name(self):
    Hname = HumanName(' '.join([self.first_name, self.last_name]))
    Hname.capitalize()
    self.last_name = Hname.last
def _generate_lastName(self):
    if self.parse_data.get("name"):
        self.lastName = HumanName(self.parse_data["name"]).last
def split_name(name):
    n = HumanName(name)
    return pd.Series([n.first, n.last])
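# Minimal usage sketch for split_name with pandas (the sample frame is
# illustrative): applying a function that returns a Series produces a
# DataFrame with positional columns 0 and 1.
import pandas as pd

df = pd.DataFrame({"name": ["Jane Q. Doe", "Juan de la Vega"]})
parts = df["name"].apply(split_name)  # columns: 0 (first), 1 (last)
df["first"], df["last"] = parts[0], parts[1]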
def parse_rss(message):
    """ Parse Feeds into the CMS Module """

    db = current.db
    s3db = current.s3db
    table = s3db.msg_rss
    record = db(table.message_id == message.message_id).select(
        table.channel_id, table.title, table.from_address, table.body,
        table.date, table.location_id, table.tags, table.author,
        limitby=(0, 1)).first()
    if not record:
        return

    post_table = s3db.cms_post

    # Is this an Update or a Create?
    body = record.body or record.title
    url = record.from_address
    if url:
        doc_table = s3db.doc_document
        exists = db(doc_table.url == url).select(doc_table.doc_id,
                                                 limitby=(0, 1)).first()
        if exists:
            exists = db(post_table.doc_id == exists.doc_id).select(
                post_table.id, limitby=(0, 1)).first()
    else:
        # Use Body
        exists = db(post_table.body == body).select(post_table.id,
                                                    limitby=(0, 1)).first()

    channel_id = record.channel_id
    tags = record.tags

    author = record.author
    if author:
        ptable = s3db.pr_person
        # https://code.google.com/p/python-nameparser/
        from nameparser import HumanName
        name = HumanName(author)
        first_name = name.first
        middle_name = name.middle
        last_name = name.last
        query = (ptable.first_name == first_name) & \
                (ptable.middle_name == middle_name) & \
                (ptable.last_name == last_name)
        pexists = db(query).select(ptable.id, limitby=(0, 1)).first()
        if pexists:
            person_id = pexists.id
        else:
            person_id = ptable.insert(first_name=first_name,
                                      middle_name=middle_name,
                                      last_name=last_name)
            s3db.update_super(ptable, dict(id=person_id))
    else:
        person_id = None

    if exists:
        post_id = exists.id
        db(post_table.id == post_id).update(title=record.title,
                                            body=body,
                                            date=record.date,
                                            location_id=record.location_id,
                                            person_id=person_id,
                                            )
        # Read existing Tags (which came from remote)
        ttable = db.cms_tag
        ltable = db.cms_tag_post
        query = (ltable.post_id == post_id) & \
                (ltable.mci == 1) & \
                (ltable.tag_id == ttable.id)
        rows = db(query).select(ttable.name)
        # Compare these to tags in current version of post
        old_tags = [r.name for r in rows]
        new_tags = []
        delete_tags = []
        for tag in tags:
            if tag not in old_tags:
                new_tags.append(tag)
        for tag in old_tags:
            if tag not in tags:
                delete_tags.append(tag)
        if new_tags or delete_tags:
            lookup_tags = []
            lookup_tags.extend(new_tags)
            lookup_tags.extend(delete_tags)
            _tags = db(ttable.name.belongs(lookup_tags)).select(
                ttable.id, ttable.name).as_dict(key="name")
            for t in new_tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name=t)
                ltable.insert(post_id=post_id,
                              tag_id=tag_id,
                              mci=1,  # This is an imported record, not added natively
                              )
            for t in delete_tags:
                tag = _tags.get(t, None)
                if tag:
                    query = (ltable.post_id == post_id) & \
                            (ltable.tag_id == tag["id"]) & \
                            (ltable.mci == 1) & \
                            (ltable.deleted == False)
                    db(query).delete()
    else:
        # Default to 'News' series
        table = db.cms_series
        series_id = db(table.name == "News").select(table.id,
                                                    cache=s3db.cache,
                                                    limitby=(0, 1)).first().id
        post_id = post_table.insert(title=record.title,
                                    body=body,
                                    date=record.date,
                                    location_id=record.location_id,
                                    person_id=person_id,
                                    series_id=series_id,
                                    mci=1,  # This is an imported record, not added natively
                                    )
        record = dict(id=post_id)
        s3db.update_super(post_table, record)
        # Source link
        if url:
            doc_table.insert(doc_id=record["doc_id"],
                             url=url,
                             )

        # Is this feed associated with an Org/Network?
        def lookup_pe(channel_id):
            ctable = s3db.msg_rss_channel
            channel_url = db(ctable.channel_id == channel_id).select(
                ctable.url, limitby=(0, 1)).first().url
            ctable = s3db.pr_contact
            ptable = s3db.pr_pentity
            query = (ctable.contact_method == "RSS") & \
                    (ctable.value == channel_url) & \
                    (ctable.pe_id == ptable.pe_id)
            pe = db(query).select(ptable.pe_id,
                                  ptable.instance_type,
                                  limitby=(0, 1)).first()
            if pe:
                pe_type = pe.instance_type
                otable = s3db[pe_type]
                org_id = db(otable.pe_id == pe.pe_id).select(
                    otable.id, limitby=(0, 1)).first().id
                return pe_type, org_id
            else:
                return None, None

        pe_type, org_id = current.cache.ram("pe_channel_%s" % channel_id,
                                            lambda: lookup_pe(channel_id),
                                            time_expire=120)
        if pe_type == "org_organisation":
            s3db.cms_post_organisation.insert(post_id=post_id,
                                              organisation_id=org_id,
                                              )
        elif pe_type == "org_group":
            s3db.cms_post_organisation_group.insert(post_id=post_id,
                                                    group_id=org_id,
                                                    )
        if tags:
            ttable = db.cms_tag
            ltable = db.cms_tag_post
            _tags = db(ttable.name.belongs(tags)).select(
                ttable.id, ttable.name).as_dict(key="name")
            for t in tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name=t)
                ltable.insert(post_id=post_id,
                              tag_id=tag_id,
                              mci=1,  # This is an imported record, not added natively
                              )

    # No Reply
    return
def populateObjectFromHTML(tree):
    proplist = []
    base_url = "https://www.museumfuernaturkunde.berlin"

    # scrape the contact info section
    arguments = ['Name', 'Email', 'Telefon', 'Fax', 'Adresse']
    for info in arguments:
        try:
            proplist.append(tree.find('div', class_=("views-field views-field-" + info))
                                .find('span', class_="field-content")
                                .get_text().replace("\r\n", ","))
        except AttributeError:
            proplist.append(None)

    # scrape the photo URI
    proplist.append(base_url + tree.find('div', class_="views-field views-field-img-URL").span.img.get('src'))

    # scrape the accordion
    accordion = {}
    # get all accordion entries
    for element in tree.find_all('section', class_="ui_segment_accordion"):
        titel = normalizeTitel(re.sub(r"[^\w .()]", "",
                                      element.find('h2', class_="ui_segment_accordion__head").get_text()).strip())
        # get all publications and split them on <br/>'s
        if titel == "Publikationen":
            accordion[titel] = parseInformation(element, "ui_segment_accordion__content", 'list')
        # search the "Forschung" (research) entry for a "Forschungsprojekte"
        # (research projects) entry and extract it
        elif titel == "Forschung":
            research = [re.sub(r"[^\w .()-:/]", "", li.text.strip(' -'))
                        for li in element.find('div', class_="ui_segment_accordion__content").find_all(['li', 'p', 'h2'])]
            groupedResearch = [list(group) for k, group in
                               groupby(research, lambda x: re.match(r".{0,5}(Forschungsprojekt|Projekt)e?:?$", x))
                               if not k]
            if len(groupedResearch) > 1:
                if 'Forschungsprojekte' in accordion:
                    accordion['Forschungsprojekte'] += groupedResearch.pop(1)
                else:
                    accordion['Forschungsprojekte'] = groupedResearch.pop(1)
            if len(groupedResearch) > 0:
                accordion[titel] = [element for sublist in groupedResearch for element in sublist if element]
        # if nothing matches, just take the text of the element
        else:
            entries = [el for el in
                       [re.sub(r"[^\w .()-:/]", "", li.text.strip(' -'))
                        for li in element.find('div', class_="ui_segment_accordion__content").find_all(['li', 'p'])]
                       if el]
            if titel in accordion:
                accordion[titel] += entries
            else:
                accordion[titel] = entries
    proplist.append(accordion)

    # try to find additional information
    try:
        for link in tree.find('div', class_="view-display-id-single_person_sidebar_view").findAll('a', href=True):
            print(link.text)
            titel = normalizeTitel(link.text.strip())
            if titel == 'Lebenslauf':
                print("Found CV")
                infoTree = BeautifulSoup(requests.get(base_url + link.get('href')).text, 'lxml')
                proplist[6]['CV'] = parseInformation(infoTree, "faqfield-answer", 'text')
            elif titel == 'Publikationen':
                print("Found publications")
                infoTree = BeautifulSoup(requests.get(base_url + link.get('href')).text, 'lxml')
                if titel in proplist[6]:
                    proplist[6][titel] += parseInformation(infoTree, "faqfield-answer", 'list')
                else:
                    proplist[6][titel] = parseInformation(infoTree, "faqfield-answer", 'list')
            else:
                print("Found something else")
                infoTree = BeautifulSoup(requests.get(base_url + link.get('href')).text, 'lxml')
                if titel in proplist[6]:
                    proplist[6][titel] += parseInformation(infoTree, "faqfield-answer", 'text')
                else:
                    proplist[6][titel] = parseInformation(infoTree, "faqfield-answer", 'text')
    except AttributeError:
        print('No additional information was found')

    # delete extra headlines
    if 'Forschung' in proplist[6]:
        proplist[6]['Forschung'] = [val for val in proplist[6]['Forschung'] if not match(val)]
    if 'Publikationen' in proplist[6]:
        proplist[6]['Publikationen'] = [val for val in proplist[6]['Publikationen'] if not match(val)]

    # set up the name parser
    constants = Constants()
    constants.titles.add('PD', 'Dipl.', 'des.', 'Professor', 'M.Sc.', 'FH')
    # parse the name and normalize odd spellings of "Ph.D."
    proplist[0] = HumanName(re.sub(r"Ph\. D\.", "Ph.D.", proplist[0]), constants=constants).as_dict()
    return proplist
def index(name):
    try:
        return_name = HumanName(name)
        return jsonify(return_name.as_dict())
    except Exception as e:
        return jsonify(str(e))
def normalize_person_name(person_name):
    new_string = name_reg_ex.sub("", person_name)
    name = HumanName(new_string)
    name.capitalize(force=True)
    return str(name)
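# capitalize(force=True) re-capitalizes even mixed-case input, which plain
# capitalize() leaves alone. A small sketch (expected output is approximate):
from nameparser import HumanName

name = HumanName("juan q. xavier VELASQUEZ y garcia jr.")
name.capitalize(force=True)
print(str(name))  # roughly: Juan Q. Xavier Velasquez y Garcia Jr.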
a_list = []
for i in range(1, 261):
    url = "https://digitalcommons.calpoly.edu/csse_fac/" + str(i)
    journal = None  # avoid a NameError when no citation is found on the page
    try:
        myRequest = requests.get(url, verify=False)
        soup = BeautifulSoup(myRequest.text, "html.parser")
        for eventRow in soup.find_all('div', attrs={'id': 'recommended_citation'}):
            for item in eventRow.find_all('p', attrs={'class': 'comments'}):
                if item.find('em') is not None:
                    for items in item.find('em'):
                        journal = items
        for eventRow in soup.find_all('p', attrs={'class': 'author'}):
            for items in eventRow.find_all('strong'):
                name = HumanName(items.text)
                a_list.append((soup.head.title.get_text().split('by')[0].strip(),
                               name.first, name.last, journal))
        print(a_list)
    except requests.exceptions.ConnectionError:  # the builtin ConnectionError would miss requests' own
        print('Whoops')

for item in a_list:
    print(item)

try:
    connection = pymysql.connect(
        host="localhost",
        user="******",
        passwd="****",
        database="jmabel466",
from nameparser import HumanName

# read names one per line; open in text mode so HumanName gets str, not bytes
with open("tempName", "r") as g:
    lines = g.readlines()

authorList = ""
for line in lines:
    authors = HumanName(line.strip())
    authorList += authors.last + ", " + authors.first + " ; "
print(authorList)
def normalize_name(name):
    """
    Normalize a name for sorting and indexing.

    This uses two powerful python libraries for differing reasons.
    `probablepeople` contains a discriminator between company and person
    names, which is used to decide whether to parse into last, first, middle
    or to leave the name alone. However, the actual name parser in
    `probablepeople` is unnecessarily complex, so strings that it determines
    to be human names are parsed instead by the simpler `nameparser`.
    """
    sname = name.strip()  # remove leading and trailing spaces
    # The recognizer tends to mistake concatenated initials for a corporation
    # name, so pad potential initials with spaces before running it: for any
    # character A-Z followed by "." and another A-Z, add a space after the
    # first. (?=[A-Z]) matches the following A-Z without consuming it.
    nname = re.sub(r"(?P<thing>[A-Z]\.)(?=[A-Z])", "\\g<thing> ", sname)
    try:
        # probablepeople doesn't understand utf-8 encoding; hand it pure unicode.
        _, type = probablepeople.tag(nname)  # discard parser result
    except probablepeople.RepeatedLabelError:
        # if it can't understand the name, it's foreign
        type = 'Unknown'
    if type == 'Corporation':
        return sname  # do not parse and reorder company names
    # special case for capitalization: flag as corporation
    if adjacent_caps.match(sname):
        return sname
    # treat anything else as a human name
    nameparts = HumanName(nname)
    normalized = ""
    if nameparts.last:
        normalized = nameparts.last
    if nameparts.suffix:
        if not normalized:
            normalized = nameparts.suffix
        else:
            normalized = normalized + ' ' + nameparts.suffix
    if normalized:
        normalized = normalized + ','
    if nameparts.title:
        if not normalized:
            normalized = nameparts.title
        else:
            normalized = normalized + ' ' + nameparts.title
    if nameparts.first:
        if not normalized:
            normalized = nameparts.first
        else:
            normalized = normalized + ' ' + nameparts.first
    if nameparts.middle:
        if not normalized:
            normalized = nameparts.middle
        else:
            normalized = ' ' + normalized + ' ' + nameparts.middle
    return normalized.strip()
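# Hedged demo of normalize_name; requires probablepeople and nameparser plus
# the module-level `adjacent_caps` regex assumed above. The outputs shown are
# approximate, since probablepeople's tagger can vary by version:
print(normalize_name("John A. Smith Jr."))        # roughly: Smith Jr., John A.
print(normalize_name("Acme Widget Corporation"))  # company names pass through unparsed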
def match_by_name(name, state=None, office=None, cycle=None, reverse_name_order=False):
    result_array = []
    name1 = HumanName(name)
    name1_standardized = None
    blocking_name = None
    # sometimes we run into a name that's flipped:
    if reverse_name_order:
        print("Running name reversal check!")
        blocking_name = simple_clean(name1.first)
        name1_standardized = simple_clean(name1.first) + " " + unnickname(name1.last)
    else:
        name1_standardized = simple_clean(name1.last) + " " + unnickname(name1.first)
        blocking_name = simple_clean(name1.last)
    # if we can't find the last name, assume the name is the last name. This might be a bad idea.
    if not blocking_name:
        blocking_name = simple_clean(name)
    possible_matches = block_by_startswith(blocking_name, starts_with_blocklength,
                                           state, office, cycle)
    for match in possible_matches:
        name2_name = HumanName(match['cand_name'])
        name2 = simple_clean(name2_name.last) + " " + unnickname(name2_name.first)
        # calculate a bunch of metrics
        text1 = name1_standardized
        text2 = name2
        # print("comparing '%s' to '%s'" % (text1, text2))
        ratio = 1 / 100.0 * fuzz.ratio(text1, text2)
        partial_ratio = 1 / 100.0 * fuzz.partial_ratio(text1, text2)
        token_sort_ratio = 1 / 100.0 * fuzz.token_sort_ratio(text1, text2)
        token_set_ratio = 1 / 100.0 * fuzz.token_set_ratio(text1, text2)
        avg_len = (len(text1) + len(text2)) / 2.0  # mean length of the two strings
        min_len = min(len(text1), len(text2))
        l_ratio = 0
        try:
            l_distance = jellyfish.levenshtein_distance(text1, text2)
            l_ratio = 1.0 - (float(l_distance) / avg_len)
        except UnicodeEncodeError:
            pass
        long_match = longest_match(text1, text2)
        lng_ratio = float(long_match) / min_len
        score = 0
        if ratio > 0.6 or partial_ratio > 0.6 or l_ratio > 0.6 or lng_ratio > 0.6:
            score = compute_scores([ratio, partial_ratio, l_ratio, lng_ratio])
        if debug:
            log.debug("|fuzzymatchresult|%s|'%s'|'%s'|score=%s|ratio=%s|partial_ratio=%s|token_sort_ratio=%s|token_set_ratio=%s|l_ratio=%s|lng_ratio=%s"
                      % (match['cand_id'], match['cand_name'], name, score, ratio,
                         partial_ratio, token_sort_ratio, token_set_ratio,
                         l_ratio, lng_ratio))
        if score > 0.8:
            name_standardized = standardize_name_from_dict(match)
            result_array.append({'name': name_standardized,
                                 'id': match['cand_id'],
                                 'score': score,
                                 'type': [],
                                 'match': False})
            if debug:
                log.debug("Match found: %s" % name_standardized)
    if debug and len(result_array) == 0:
        log.debug("No match for %s, which was standardized to: %s"
                  % (name, name1_standardized))
    # If it's a good match and there's only one, call it a definite match.
    if len(result_array) == 1:
        if result_array[0]['score'] > 0.9:
            result_array[0]['match'] = True
    # surprisingly, google refine *doesn't* sort by score.
    return result_array
print "Now let's take care of those", train.Age.isnull().sum(), "null values" # In[30]: print "One idea would be to take the median age:", train.Age.median(), "or mean:", train.Age.mean(), "but I think we can get a clue from people's titles (ex Mr., Mrs.)" # First let's see what titles we have. # In[31]: titles = [] for name in train.Name: titles.append(HumanName(name).title) print set(titles) # The titles look good, expect there's an empty string, perhaps that's for the less common titles, but I feel pretty good about this range since it has covered the basicis. # Now let's make a new feature for these titles. # In[32]: train.Title = train.Name.map(lambda x: HumanName(x).title) # In[33]: print train[train.Title == ''].Name print
# Variable for yesterday's date or the completion date. Modified to enter the
# date manually, since the date of completion wasn't always the prior day.
yesterday_date = easygui.enterbox("Enter date of completion")

# Loop that builds each email. The body text can be altered here; chrg_tp
# determines whether the work was warranty-related or not.
for chrg_tp, w, tp, ml, nm, bldg, c, c2, alt_w, ml2 in zip(
        charge_type, wo, type, email, name, building, cc, cc2, alt_wo, email2):
    if chrg_tp == "EXTERNAL CHARGE":
        Emailer(
            """<p>Good Morning {0} and {1}! </p><p>Our service team completed a work order at {2} (WO# {4}) on {3}. </p><p>Any necessary closing documentation will follow soon from our A.R. department. </p><p>Please let us know if you have any questions or need any additional information regarding the completed work. </p><p>All of us at Company thank you for your continued business!</p>"""
            .format(HumanName(nm).first, HumanName(ml2).first, bldg,
                    yesterday_date, str(alt_w))
            .replace(" and !", "!", 1)
            .replace("T&M Roof Leak", "leak repairs", 1)
            .replace("T&M Waterproofing Leak", "leak repairs", 1)
            .replace("new penetration installation", "a new penetration installation", 1)
            .replace("RECALL 2nd Trip", "", 1)
            .replace("RECALL 3rd+ Trip", "", 1)
            .replace("Roof Repair Work", "roof repairs", 1)
            .replace("(from KPC report)", "", 1)
            .replace("T&M", "a", 1)
            .replace("combined", "", 1)
            .replace("Sealant Work", "sealant work",
def get_lname(somename):
    name = HumanName(somename)
    return name.last
def namer(field):
    # pre
    # encode/decode pair keeps this a str in Python 3 (the original Python 2
    # code left the encoded bytes as-is)
    if isinstance(field, tuple):
        w_name = re.sub('[\t\r\n]', '',
                        ", ".join([x.encode('ascii', 'ignore').decode('ascii') for x in field])).upper()
    else:
        w_name = re.sub('[\t\r\n]', '', field.encode('ascii', 'ignore').decode('ascii')).upper()
    if 'ANONYMOUS' not in w_name:
        if ' FORMER ' not in w_name:
            w_name = re.split(";", w_name)[0]
        else:
            w_name = re.split(";", w_name)[1]
        w_name = re.sub(r"(?<=[`'/+]) | (?=['`/+])", '', w_name)  # 6A, 4A-C
        out = HumanName(w_name)
        out.middle = re.sub(r"^[A-Z] |^[A-Z]\. ", '', out.middle)
        if " " in out.last:
            out.last = re.sub(r"^[A-Z] |^[A-Z]\. ", '', out.last)
        if re.sub(r"^[A-Z]\.|^[A-Z]", '', out.first) == '' and len(out.middle) != 0:
            out.first, out.middle = out.middle, ""
        else:
            out.first = re.sub(r"^[A-Z] |^[A-Z]\. ", '', out.first)
        # post
        if out.middle.startswith("FOR ") or out.middle.startswith("- "):  # 7A, 1B, 3E
            out.middle = ""
        if " FOR " in out.last:
            out.last = re.sub(" FOR .*", '', out.last)
        if len(out.last) == 0 and len(out.title) != 0:  # 9A
            if " " in out.first:
                out = HumanName(out.first)
            else:
                out.first, out.last = "", out.first
        if " AND " in out.middle or " & " in out.middle:
            out.last = re.split("( AND )|( & )", out.middle)[0]
            out.middle = ""
        if "AND" in out.last or "&" in out.last:
            if out.last.startswith("AND ") or out.last.startswith("& "):  # 3F
                out.last = HumanName(out.last).last
            elif " AND " in out.last or " & " in out.last:
                out.last = re.sub("( AND ).*|( & ).*", '', out.last)
        out.first = re.split(r"( AND )|&|/|\+", out.first)[0]
        out.last = re.split("/", out.last)[0].strip()
        if len(re.sub("[^A-Z]", '', out.first)) == 1 and " " in out.last:
            out.first = out.last.split(" ")[0]
            out.last = out.last.split(" ")[1]
        out.capitalize()
        first, last = out.first, out.last
        if len(out.middle) > 0:
            if re.sub(r"^[A-Z]\.|^[A-Z]", '', out.middle) == '':
                out.middle = ""
            elif first.endswith("-") or out.middle.startswith("-"):
                first += out.middle
            else:
                first += " %s" % out.middle  # 8A-B
        if len(out.suffix) > 0:
            last += " %s" % out.suffix  # 2A
        return (first, last)
    else:
        name = HumanName(w_name)
        return (name.first, name.last)
def middlename_filter(string):
    if string:
        return HumanName(string).middle
    return 'N/A'
def printLabel(user, eventName):
    FONT_NAME = "/usr/share/fonts/truetype/DejaVuSans.ttf"
    f17 = ImageFont.truetype(FONT_NAME, 17)
    f18 = ImageFont.truetype(FONT_NAME, 18)
    f25 = ImageFont.truetype(FONT_NAME, 25)
    f40 = ImageFont.truetype(FONT_NAME, 40)
    f80 = ImageFont.truetype(FONT_NAME, 80)
    f100 = ImageFont.truetype(FONT_NAME, 100)

    name = HumanName(user['Name'])
    ticket = user['Tickets'].split()
    if 'Abend' in user['Tickets']:
        ticket[0] += ' ' + ticket[1]
    mitarbeiter = False
    if 'Mitarbeiter' in user['Tickets']:
        mitarbeiter = True
    if 'Free' in user['Tickets']:
        mitarbeiter = True

    for i in range(2):
        img = Image.new("RGB", imgSize, fillWhite)
        draw = ImageDraw.Draw(img)
        printCenter(draw, 50, (name.first.capitalize() + ' ' + name.middle.capitalize()).strip(), f100)
        printCenter(draw, 170, name.last.capitalize(), f40)
        if i == 0:
            if mitarbeiter:
                printCenter(draw, 240, "Mitarbeiter", f80)
            printCenter(draw, 350, ticket[0] + ' (' + user['order'] + ')', f40)
            printCenter(draw, 400, eventName, f40)
            if 'Alter' in user:
                printCenter(draw, 450, "Alter: " + str(user['Alter']), f25)
            if 'Seminare und Workshops' in user:
                seminar = user['Seminare und Workshops'].split('(')
                printCenter(draw, 485, seminar[0], f25)
        else:
            text = """
Samstag
11.30 Uhr - “Zwischen Heimweh und Fernsucht”
13.00 Uhr - Mittagessen
14.30 Uhr - Seminare & Workshops
16.30 Uhr - “We will block you”
18.00 Uhr - Abendessen
20.00 Uhr - “Comming Home”
22.00 Uhr - Latenightangebote & Konzerte"""
            printLeft(draw, 0, 240, text, f17)
            text = """
Sonntag
08.00 Uhr - Frühstück
09.30 Uhr - “Dieser Weg wird kein Leichter sein”
12.00 Uhr - Mittagessen
13.30 Uhr - “Ist herzlich Willkommen übertrieben?”
14.30 Uhr - Abreise

Einlass jeweils 15 Minuten vor Veranstaltungsbeginn"""
            printLeft(draw, 450, 240, text, f17)
            text = """
Solltest du Erste Hilfe benötigen, erreichst du das
Connect-Notfall-Team unter der Telefonnummer
0170 - 27 65 185 oder du meldest dich am Infopoint."""
            printLeft(draw, 0, 450, text, f18)

        img.save('tmp.png')
        qlr = BrotherQLRaster(CONFIG['printer'])
        qlr.exception_on_warning = True
        convert(qlr, ['tmp.png'], '54', cut=True, dither=False, compress=True,
                red=False, dpi_600=False, hq=True, rotate=90)
        send(instructions=qlr.data, printer_identifier=CONFIG['identifier'],
             backend_identifier=CONFIG['backend'], blocking=True)
def lastname_filter(string):
    if string:
        return HumanName(string).last
    return 'N/A'
def _f_clean_func(self, string):
    human = HumanName(string)
    return human.first, human.middle, human.last
def parse_rss_2_cap(message):
    """ Parse RSS Feeds into the CAP Module """

    db = current.db
    s3db = current.s3db
    table = s3db.msg_rss
    message_id = message.message_id
    record = db(table.message_id == message_id).select(
        table.id, table.channel_id, table.title, table.from_address,
        table.body, table.date, table.location_id, table.author,
        limitby=(0, 1)).first()
    if not record:
        return

    pstable = s3db.msg_parsing_status
    # not adding (pstable.channel_id == record.channel_id) to the query,
    # because two channels (http://host.domain/eden/cap/public.rss and
    # http://host.domain/eden/cap/alert.rss) may contain a common url,
    # e.g. http://host.domain/eden/cap/public/xx.cap
    pquery = (pstable.message_id == message_id)
    prows = db(pquery).select(pstable.id, pstable.is_parsed)
    for prow in prows:
        if prow.is_parsed:
            return

    alert_table = s3db.cap_alert
    info_table = s3db.cap_info

    # Is this an Update or a Create?
    # @ToDo: Use guid?
    # Use Body
    body = record.body or record.title
    query = (info_table.description == body)
    exists = db(query).select(info_table.id, limitby=(0, 1)).first()

    author = record.author
    if author:
        ptable = s3db.pr_person
        # https://code.google.com/p/python-nameparser/
        from nameparser import HumanName
        name = HumanName(author)
        first_name = name.first
        middle_name = name.middle
        last_name = name.last
        query = (ptable.first_name == first_name) & \
                (ptable.middle_name == middle_name) & \
                (ptable.last_name == last_name)
        pexists = db(query).select(ptable.id, limitby=(0, 1)).first()
        if pexists:
            person_id = pexists.id
        else:
            person_id = ptable.insert(first_name=first_name,
                                      middle_name=middle_name,
                                      last_name=last_name)
            s3db.update_super(ptable, dict(id=person_id))
    else:
        person_id = None

    if exists:
        # @ToDo: Use XSLT
        info_id = exists.id
        db(info_table.id == info_id).update(headline=record.title,
                                            description=body,
                                            created_on=record.date,
                                            #location_id = record.location_id,
                                            #person_id = person_id,
                                            )
    else:
        # Embedded link
        url = record.from_address
        import_xml = s3db.resource("cap_alert").import_xml
        stylesheet = os.path.join(current.request.folder, "static", "formats",
                                  "cap", "import.xsl")
        try:
            file = fetch(url)
        except urllib2.HTTPError as e:
            import base64
            rss_table = s3db.msg_rss_channel
            query = (rss_table.channel_id == record.channel_id)
            channel = db(query).select(rss_table.date,
                                       rss_table.etag,
                                       rss_table.url,
                                       rss_table.username,
                                       rss_table.password,
                                       limitby=(0, 1)).first()
            username = channel.username
            password = channel.password
            if e.code == 401 and username and password:
                request = urllib2.Request(url)
                base64string = base64.encodestring("%s:%s" % (username, password))
                request.add_header("Authorization", "Basic %s" % base64string)
            else:
                request = None
            try:
                file = urllib2.urlopen(request).read() if request else fetch(url)
            except urllib2.HTTPError as e:
                # Check if there are links to look into
                from urlparse import urlparse
                ltable = s3db.msg_rss_link
                query_ = (ltable.rss_id == record.id) & (ltable.deleted != True)
                rows_ = db(query_).select(ltable.type, ltable.url)
                url_format = "{uri.scheme}://{uri.netloc}/".format
                url_domain = url_format(uri=urlparse(url))
                for row_ in rows_:
                    url = row_.url
                    if url and row_.type == "application/cap+xml" and \
                       url_domain == url_format(uri=urlparse(url)):
                        # Same domain, so okay to use the same username/pwd combination
                        if e.code == 401 and username and password:
                            request = urllib2.Request(url)
                            request.add_header("Authorization", "Basic %s" % base64string)
                        else:
                            request = None
                        try:
                            file = urllib2.urlopen(request).read() if request else fetch(url)
                        except urllib2.HTTPError as e:
                            current.log.error("Getting content from link failed: %s" % e)
        else:
            # Import via XSLT
            import_xml(StringIO(file), stylesheet=stylesheet, ignore_errors=True)
def parse_name(name):
    h = HumanName(name)
    h.capitalize()
    # strip() guards against trailing spaces when middle or suffix is empty
    return {'firstname': ("%s %s" % (h.first, h.middle)).strip(),
            'lastname': ("%s %s" % (h.last, h.suffix)).strip()}
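# Usage sketch for parse_name (the sample input is illustrative):
print(parse_name("mr. john q. public iv"))
# roughly: {'firstname': 'John Q.', 'lastname': 'Public IV'}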
def parse_rss_2_cms(message):
    """ Parse Feeds into the CMS Module """

    db = current.db
    s3db = current.s3db
    table = s3db.msg_rss
    record = db(table.message_id == message.message_id).select(
        table.channel_id, table.title, table.from_address, table.body,
        table.date, table.location_id, table.tags, table.author,
        limitby=(0, 1)).first()
    if not record or not record.body:
        return

    post_table = s3db.cms_post

    # Is this an Update or a Create?
    body = record.body or record.title
    url = record.from_address
    if url:
        doc_table = s3db.doc_document
        exists = db(doc_table.url == url).select(doc_table.doc_id,
                                                 limitby=(0, 1)).first()
        if exists:
            exists = db(post_table.doc_id == exists.doc_id).select(
                post_table.id, limitby=(0, 1)).first()
    else:
        # Use Body
        exists = db(post_table.body == body).select(post_table.id,
                                                    limitby=(0, 1)).first()

    channel_id = record.channel_id
    tags = record.tags

    author = record.author
    if author:
        ptable = s3db.pr_person
        # https://code.google.com/p/python-nameparser/
        from nameparser import HumanName
        name = HumanName(author)
        first_name = name.first
        middle_name = name.middle
        last_name = name.last
        query = (ptable.first_name == first_name) & \
                (ptable.middle_name == middle_name) & \
                (ptable.last_name == last_name)
        pexists = db(query).select(ptable.id, limitby=(0, 1)).first()
        if pexists:
            person_id = pexists.id
        else:
            person_id = ptable.insert(first_name=first_name,
                                      middle_name=middle_name,
                                      last_name=last_name)
            s3db.update_super(ptable, dict(id=person_id))
    else:
        person_id = None

    if exists:
        post_id = exists.id
        db(post_table.id == post_id).update(title=record.title,
                                            body=body,
                                            created_on=record.date,
                                            location_id=record.location_id,
                                            person_id=person_id,
                                            )
        # Read existing Tags (which came from remote)
        ttable = db.cms_tag
        ltable = db.cms_tag_post
        query = (ltable.post_id == post_id) & \
                (ltable.mci == 1) & \
                (ltable.tag_id == ttable.id)
        rows = db(query).select(ttable.name)
        # Compare these to tags in current version of post
        old_tags = [r.name for r in rows]
        new_tags = []
        delete_tags = []
        for tag in tags:
            if tag not in old_tags:
                new_tags.append(tag)
        for tag in old_tags:
            if tag not in tags:
                delete_tags.append(tag)
        if new_tags or delete_tags:
            lookup_tags = []
            lookup_tags.extend(new_tags)
            lookup_tags.extend(delete_tags)
            _tags = db(ttable.name.belongs(lookup_tags)).select(
                ttable.id, ttable.name).as_dict(key="name")
            for t in new_tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name=t)
                ltable.insert(post_id=post_id,
                              tag_id=tag_id,
                              mci=1,  # This is an imported record, not added natively
                              )
            for t in delete_tags:
                tag = _tags.get(t, None)
                if tag:
                    query = (ltable.post_id == post_id) & \
                            (ltable.tag_id == tag["id"]) & \
                            (ltable.mci == 1) & \
                            (ltable.deleted == False)
                    db(query).delete()
    else:
        # Default to 'News' series
        table = db.cms_series
        series = db(table.name == "News").select(table.id,
                                                 cache=s3db.cache,
                                                 limitby=(0, 1)).first()
        try:
            series_id = series.id
        except AttributeError:
            raise KeyError("News Series not present in CMS module")
        post_id = post_table.insert(title=record.title,
                                    body=body,
                                    created_on=record.date,
                                    location_id=record.location_id,
                                    person_id=person_id,
                                    series_id=series_id,
                                    mci=1,  # This is an imported record, not added natively
                                    )
        record = dict(id=post_id)
        s3db.update_super(post_table, record)
        # Source link
        if url:
            doc_table.insert(doc_id=record["doc_id"],
                             url=url,
                             )
        if tags:
            ttable = db.cms_tag
            ltable = db.cms_tag_post
            _tags = db(ttable.name.belongs(tags)).select(
                ttable.id, ttable.name).as_dict(key="name")
            for t in tags:
                tag = _tags.get(t, None)
                if tag:
                    tag_id = tag["id"]
                else:
                    tag_id = ttable.insert(name=t)
                ltable.insert(post_id=post_id,
                              tag_id=tag_id,
                              mci=1,  # This is an imported record, not added natively
                              )

    # No Reply
    return
def extract_title(self):
    """Extracts the title from the name feature and normalizes it."""
    self.Xy["title"] = (
        self.Xy.name.apply(lambda x: HumanName(x).title)
        .replace(self.title_translator)
        .replace({r"\.": ""}, regex=True))
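# Hedged sketch of the pipeline above with a hypothetical title_translator
# (the real mapping lives on the class and may differ):
import pandas as pd
from nameparser import HumanName

title_translator = {"Mme": "Mrs", "Mlle": "Miss"}  # illustrative values
names = pd.Series(["Braund, Mr. Owen Harris", "Heikkinen, Miss. Laina"])
titles = (names.apply(lambda x: HumanName(x).title)
          .replace(title_translator)
          .replace({r"\.": ""}, regex=True))
# roughly: ["Mr", "Miss"]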
except IOError as e:
    print("Error writing to output file: I/O error({0}): {1}".format(e.errno, e.strerror))
    sys.exit(1)

# csv writer helper. Modify the header if you need a custom csv format.
csvwriter = csv.writer(f2)
csvwriter.writerow(['Name', 'Email Address'])

# do work, son
soup = BeautifulSoup(f1)
# modify find queries as needed
for contact in soup.find_all("div", {"class": "tcell"}):
    try:
        nameNode = contact.div
        if not nameNode:
            continue
        name = HumanName(nameNode.span.b.text.strip())
        emailNode = contact.find("div", {"class": "c"})
        if emailNode:
            csvwriter.writerow([str(name), emailNode.text])
    except Exception:
        print("error with:")
        print(contact)

f1.close()
f2.close()
def _generate_lastName(self):
    tmp = extract(RULES["name"], self.sec)
    self.lastName = HumanName(tmp).last
def GetNameLink(name):
    # Finds and returns the formatted name and wikilinks for a given name.
    if name == "":
        return ["", ""]
    # old_name = name
    name = name.replace(". ", ".").replace(".", ". ")
    split_name = name.split(" ")
    mixed_case = [f for f in split_name
                  if (not f.islower() and not f.isupper()) or f.islower()]
    surname_index = 0
    if mixed_case != []:
        surname_index = split_name.index(mixed_case[-1]) + 1
    first_names = " ".join(split_name[:surname_index])
    surname = HumanName(" ".join(split_name[surname_index:]))
    surname.capitalize(force=True)
    name = (first_names + " " + str(surname)).strip()

    global name_links
    global corrections
    global new_names
    key = LowerName(name)  # key for name in names dict
    if key in name_links:
        links = name_links[key]
    else:
        page_text = GetSoup("https://en.wikipedia.org/wiki/" + name.replace(" ", "_"),
                            False).text
        page_text = "" if page_text is None else page_text
        title = name  # player's article's title
        player_page = ["International Tennis Federation", "Prize money",
                       "Grand Slam", "tennis career", "Wikipedia does not have",
                       "WTA", "ITF", "ATP"]
        disamb_page = ["may refer to"]
        disamb = " (tennis)"
        is_disamb = False
        pipe = False  # pipe [[title|name]] instead of [[title]].
        if "Redirected from" in page_text:  # redirected
            soup = GetSoup(page_text, True)
            title = str(soup.title.string).replace(" - Wikipedia", "").replace(" – Wikipedia", "").strip()
            if "tennis" in title or any([f in page_text for f in disamb_page]):
                # redirected to disambiguated page, or disamb page
                is_disamb = True
            pipe = True
            title = re.sub(r" \(.*\)", "", title)
            name = title
            # pipe = True  # display English spelling/maiden name (e.g. "Margaret Court" instead of "Margaret Smith" before she married).
        if (not any([f in page_text for f in player_page])
                or any([f in page_text for f in disamb_page]) and page_text != ""):
            # article exists for name but for a different person, or disamb page
            is_disamb = True
            pipe = True
        wikilink = "[[" + title + (disamb if is_disamb else "") + ("|" + name if pipe else "") + "]]"
        split_name = title.split(" ")
        # reduce name to first-name initials + last name, e.g. "J.-L. Struff"
        abbr_name = "-".join(f[0] for f in split_name[0].split("-")) + " " + " ".join(split_name[1:])
        abbr_wikilink = "[[" + title + (disamb if is_disamb else "") + "|" + abbr_name + "]]"
        name_links[key] = [wikilink, abbr_wikilink]
        links = name_links[key]
        # # add entry to new names list
        # exists = "Diese Seite existiert nicht" not in page_text
        # disamb = disamb if is_disamb else ""
        # link = f'<a href="https://en.wikipedia.org/wiki/{title}{disamb}" style="color:{"blue" if exists else "red"};">{title}{disamb}</a>'
        # new_names += f"\t<li>{old_name} → [[{abbr_wikilink.replace(title + disamb, link)}]]</li>"

    if "|" in links[0]:
        corrections_key = links[0][:links[0].index("|")]
    else:
        corrections_key = links[0]
    corrections_key = LowerName(corrections_key).strip("[]")
    if corrections_key in corrections[0]:  # name has correction
        links = corrections[0][corrections_key]
    abbr = links[1][links[1].index("|") + 1:] if "|" in links[1] else links[1]
    abbr = abbr.strip("[]")
    if abbr in corrections[1]:
        links[1] = links[1][:links[1].index("|") + 1] + corrections[1][abbr] + "]]"
    return links