def right_answer(test_ans, gold_ans_list):
    """Return True if test_ans matches any gold answer, either exactly or
    as a substring in either direction after normalization."""
    if test_ans in gold_ans_list:
        return True
    for gold_ans in gold_ans_list:
        if (normalize_string(test_ans) in normalize_string(gold_ans)
                or normalize_string(gold_ans) in normalize_string(test_ans)):
            return True
    return False
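# Every snippet in this section calls normalize_string(), which is defined
# elsewhere. A minimal sketch of what such a helper commonly does (ASCII
# folding, lowercasing, punctuation stripping) is below; the exact behavior
# in each original codebase is an assumption.
import re
import unicodedata

def normalize_string(s):
    # Hypothetical reconstruction: fold accents to ASCII, lowercase,
    # replace punctuation with spaces, and collapse runs of whitespace.
    s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
    s = re.sub(r"[^\w\s]", " ", s.lower())
    return re.sub(r"\s+", " ", s).strip()

# Under that sketch, right_answer('The Eiffel Tower!', ['eiffel tower'])
# returns True via the substring check after normalization.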
def normalize_vcard_contact(vcard_data: vobject.base.Component) -> ContactData:
    """Convert a parsed vCard component into a ContactData record."""
    uid = "0"
    categories = ['uncategorized']
    if "fn" in vcard_data.contents:
        name = normalize_string(vcard_data.fn.value)
    else:
        raise RuntimeError("Contact has no name")
    if "categories" in vcard_data.contents:
        categories = vcard_data.categories.value
    contact = ContactData(uid, name, categories)
    return contact
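# Minimal usage sketch for normalize_vcard_contact(). ContactData's field
# names are assumptions, since its definition is not shown here.
import vobject

card = vobject.readOne(
    "BEGIN:VCARD\r\n"
    "VERSION:3.0\r\n"
    "FN:Ada Lovelace\r\n"
    "CATEGORIES:mathematicians,friends\r\n"
    "END:VCARD\r\n"
)
contact = normalize_vcard_contact(card)
# card.fn.value is normalized into the contact's name; card.categories.value
# arrives as a list, e.g. ['mathematicians', 'friends']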
def find_page_by_name(self, name: str, point=None, radius=None):
    """Find a single Wikipedia page given a name

    Search for the top 5 results. If there's a result whose normalized name
    matches exactly, choose that article. Otherwise, choose the first
    result. Note that even when the name is not exactly the same, the first
    result usually makes sense. For example, the first result for "Mount
    Rainier Wilderness" is "Mount Rainier National Park", and it is
    reasonable to show the Wikipedia page for the National Park when a user
    clicks on the wilderness polygon.

    Args:
        - name: name to search for
        - point: point in WGS84 to search around
        - radius: radius in meters to search around point
    """
    if not isinstance(name, str):
        raise TypeError('name must be str')
    if point is not None:
        if not isinstance(point, Point):
            raise TypeError('point must be of type Point')
        if radius is not None and radius > 10000:
            raise ValueError('max radius 10,000 meters')

    if point is None:
        res = wikipedia.search(name, results=5)
    else:
        lon, lat = list(point.coords)[0]
        radius = 400 if radius is None else radius
        res = wikipedia.geosearch(
            latitude=lat, longitude=lon, title=name, results=5,
            radius=radius)

    exact_match = [
        ind for ind, s in enumerate(res)
        if normalize_string(s) == normalize_string(name)
    ]
    choice = exact_match[0] if exact_match else 0

    # Turn auto_suggest off because the page name is known to exist.
    # Otherwise page names sometimes redirect from a name that exists to
    # one that does not. For example,
    # ```
    # wikipedia.page('Shasta-Trinity National Forest')
    # ```
    # raises `PageError: Page id "shasta trinity national forests" does not
    # match any pages. Try another id!`, while the page does exist:
    # https://en.wikipedia.org/wiki/Shasta%E2%80%93Trinity_National_Forest
    # See also
    # https://github.com/goldsmith/Wikipedia/issues/192
    # https://github.com/goldsmith/Wikipedia/issues/176
    return wikipedia.page(res[choice], auto_suggest=False)
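# Usage sketch: Point is assumed to be shapely.geometry.Point, and `finder`
# is a hypothetical instance of the class this method belongs to.
from shapely.geometry import Point

page = finder.find_page_by_name('Mount Rainier National Park')

# geosearch around a WGS84 point (lon, lat), radius in meters
point = Point(-121.7603, 46.8523)
page = finder.find_page_by_name('Mount Rainier Wilderness',
                                point=point, radius=5000)
print(page.url)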
def tensors_from_pair(input_lang, output_lang, pair):
    # SMSSpamCollection lines are "label<TAB>text", so pair[1] (the message
    # text) is the input and pair[0] (the label) is the target.
    input_tensor = tensor_from_sentence(input_lang, pair[1])
    target_tensor = tensor_from_sentence(output_lang, pair[0])
    return (input_tensor, target_tensor)


if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    with open('input.pickle', 'rb') as f:
        input_lang = pickle.load(f)
    with open('target.pickle', 'rb') as f:
        target_lang = pickle.load(f)

    with open('../assets/SMSSpamCollection.txt') as f:
        lines = f.readlines()
    pairs = [[normalize_string(s) for s in line.split('\t')]
             for line in lines]

    # load the model
    hidden_size = 256
    model = RNN(input_lang.n_words, target_lang.n_words,
                hidden_size).to(device)
    param = torch.load("model_data/model4.pth")
    for p in model.parameters():
        print(p)
    model.load_state_dict(param)
    print("-" * 50)
    for p in model.parameters():
        print(p)

    input_tensor = tensor_from_sentence(input_lang, pairs[1][1]).to(device)
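# The script relies on tensor_from_sentence(), whose definition is not shown.
# In the standard PyTorch seq2seq tutorials the equivalent helper looks
# roughly like the sketch below; the EOS index and the Lang API (word2index)
# are assumptions here.
import torch

EOS_token = 1  # assumed index of the end-of-sentence symbol

def tensor_from_sentence(lang, sentence):
    # Hypothetical reconstruction: map each word to its vocabulary index,
    # append EOS, and return a column tensor of shape (len + 1, 1).
    indexes = [lang.word2index[word] for word in sentence.split(' ')]
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)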
def train_set_parser(publication_txt_path_prefix, publications_json_path,
                     data_set_citations_json_path, data_sets_json_path,
                     output_filename):
    citation_dict = dict()

    print("Loading data_set_citations.json file...")
    # open the data_set_citations.json file and parse it as JSON
    with open(data_set_citations_json_path) as json_data_set_citations:
        data_set_citations = json.load(json_data_set_citations)

        # loop over the citation records
        for citation_info in tqdm(data_set_citations,
                                  total=len(data_set_citations)):
            publication_id = citation_info.get("publication_id", None)
            data_set_id = citation_info.get("data_set_id", None)
            mention_list = citation_info.get("mention_list", None)

            formatted_mention_list = []
            for mention in mention_list:
                # drop non-ASCII characters, then normalize
                mention = mention.encode('ascii', 'ignore').decode('ascii')
                mention = normalize_string(mention)
                sentences = split_into_sentences(mention)
                words = []
                for sentence in sentences:
                    words += word_tokenize(sentence)
                words = [w for w in words if len(w) < 15]
                if len(words) > 0:
                    formatted_mention_list.append(words)

            if publication_id in citation_dict:
                citation_dict[publication_id].append(
                    [data_set_id, formatted_mention_list])
            else:
                citation_dict[publication_id] = [[
                    data_set_id, formatted_mention_list
                ]]

    # prefix for formatted publication txt files
    formatted_txt_path_prefix = "./formatted-data/"

    # collect tokenized sentences per publication
    formatted_publications = dict()

    print("Tokenizing publication files...")
    # open the publications.json file and parse it as JSON
    with open(publications_json_path) as json_publication_file:
        publication_list = json.load(json_publication_file)

        # loop over the elements in the list
        for publication_info in tqdm(publication_list,
                                     total=len(publication_list)):
            # get information on publication
            publication_id = publication_info.get("publication_id", None)
            text_file_name = publication_info.get("text_file_name", None)

            # get raw text
            raw_text = ''
            txt_file_path = publication_txt_path_prefix + text_file_name
            with open(txt_file_path) as txt_file:
                for line in txt_file:
                    stripped_line = line.strip()
                    raw_text += ' ' + stripped_line
                    if len(stripped_line.split()) <= 5:
                        # mark a sentence boundary for split_into_sentences()
                        raw_text += '<stop>'
            raw_text = raw_text.encode('ascii', 'ignore').decode('ascii')
            raw_text = normalize_string(raw_text)

            # keep sentences of 10-30 short tokens
            formatted_text_list = []
            sentences = split_into_sentences(raw_text)
            for sentence in sentences:
                words = word_tokenize(sentence)
                words = [w for w in words if len(w) < 15]
                if 10 <= len(words) <= 30:
                    formatted_text_list.append(words)
            formatted_publications[publication_id] = formatted_text_list

    # tag mentions in publication text and write to a csv file
    output_filepath = formatted_txt_path_prefix + output_filename
    with open(output_filepath, 'w') as csvfile:
        fieldnames = ['publication_id', 'sentence', 'label_sequence']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                quoting=csv.QUOTE_ALL)
        writer.writeheader()

        print("Tagging dataset mentions in publications...")
        output = extract_formatted_data(formatted_publications, citation_dict)

        print("Writing on new csv file...", end='')
        writer.writerows(output)
        print("DONE")
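# Both parsers assume NLTK's word_tokenize (from nltk.tokenize import
# word_tokenize) and a split_into_sentences() helper that honors the
# '<stop>' markers inserted above. That helper is not shown; the sketch
# below is an assumption, and the project's real splitter is likely more
# elaborate.
import re

def split_into_sentences(text):
    # Hypothetical reconstruction: break on explicit '<stop>' markers,
    # then on simple sentence-final punctuation.
    sentences = []
    for chunk in text.split('<stop>'):
        sentences += re.split(r'(?<=[.!?])\s+', chunk.strip())
    return [s for s in sentences if s]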
def test_set_parser(publication_txt_path_prefix, publications_json_path,
                    data_sets_json_path, output_filename):
    data_set_mention_info = []
    pub_date_dict = dict()

    print("Loading data_sets.json file...")
    with open(data_sets_json_path) as json_data_sets:
        data_sets = json.load(json_data_sets)

        for data_set_info in tqdm(data_sets, total=len(data_sets)):
            data_set_id = data_set_info.get("data_set_id", None)

            name = data_set_info.get("name", None)
            name = name.encode('ascii', 'ignore').decode('ascii')

            date = data_set_info.get("date", None)
            date = date.encode('ascii', 'ignore').decode('ascii')
            date = date[:10]
            if 'None' in date:
                date = '1800-01-01'
            # crude sortable ordinal: year * 12 * 31 + month * 31 + day
            date = int(date[:4]) * 12 * 31 + int(date[5:7]) * 31 + int(
                date[8:10])

            mention_list = data_set_info.get("mention_list", None)
            formatted_mention_list = []

            name = normalize_string(name)
            name_words = word_tokenize(name)
            formatted_mention_list.append(
                [name_words, list_to_string(name_words)])

            for mention in mention_list:
                mention = mention.encode('ascii', 'ignore').decode('ascii')
                mention = normalize_string(mention).strip()
                mention = re.sub(r"\s\s+", " ", mention)
                # skip short all-lowercase mentions to avoid pronoun-like
                # mentions such as 'data' or 'time'
                if all(c.islower() for c in mention) and len(
                        mention.split()) <= 2:
                    continue
                sentences = split_into_sentences(mention)
                words = []
                for sentence in sentences:
                    words += word_tokenize(sentence)
                words = [w for w in words if len(w) < 15]
                if len(words) > 0:
                    formatted_mention_list.append(
                        [words, list_to_string(words)])

            data_set_mention_info.append(
                [date, data_set_id, formatted_mention_list])

    data_set_mention_info.sort(key=lambda x: int(x[0]), reverse=True)

    # prefix for formatted publication txt files
    formatted_txt_path_prefix = "./formatted-data/"

    # collect tokenized sentences per publication
    formatted_publications = dict()

    print("Tokenizing publications.json file...")
    # open the publications.json file and parse it as JSON
    with open(publications_json_path) as json_publication_file:
        publication_list = json.load(json_publication_file)

        # loop over the elements in the list
        for publication_info in tqdm(publication_list,
                                     total=len(publication_list)):
            # get information on publication
            publication_id = publication_info.get("publication_id", None)

            title = publication_info.get("title", None)
            title = title.encode('ascii', 'ignore').decode('ascii')

            text_file_name = publication_info.get("text_file_name", None)
            unique_identifier = publication_info.get(
                "unique_identifier", None)

            # if the id starts with 'bbk', pub_date is None
            if 'bbk' not in unique_identifier:
                pub_date = publication_info.get("pub_date", None)
            else:
                pub_date = '2200-01-01'
            pub_date = pub_date.encode('ascii', 'ignore').decode('ascii')
            pub_date = int(pub_date[:4]) * 12 * 31 + int(
                pub_date[5:7]) * 31 + int(pub_date[8:10])

            # get raw text
            raw_text = ''
            txt_file_path = publication_txt_path_prefix + text_file_name
            with open(txt_file_path) as txt_file:
                for line in txt_file:
                    stripped_line = line.strip()
                    raw_text += ' ' + stripped_line
                    if len(stripped_line.split()) <= 5:
                        # mark a sentence boundary for split_into_sentences()
                        raw_text += '<stop>'
            raw_text = raw_text.encode('ascii', 'ignore').decode('ascii')
            raw_text = normalize_string(raw_text)

            # keep sentences of 10-30 short tokens
            formatted_text_list = []
            chopped_raw_text = ''
            sentences = split_into_sentences(raw_text)
            for sentence in sentences:
                words = word_tokenize(sentence)
                words = [w for w in words if len(w) < 15]
                if 10 <= len(words) <= 30:
                    formatted_text_list.append(words)
                    chopped_raw_text += ' ' + list_to_string(words)
            formatted_publications[publication_id] = [
                formatted_text_list, chopped_raw_text.strip()
            ]
            pub_date_dict[publication_id] = pub_date

    # tag mentions in publication text and write to a csv file
    output_filepath = formatted_txt_path_prefix + output_filename
    with open(output_filepath, 'w') as csvfile:
        fieldnames = [
            'publication_id', 'sentence', 'label_sequence', 'labeled'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                quoting=csv.QUOTE_ALL)
        writer.writeheader()

        print("Tagging pre-found dataset mentions in publications...")
        output = extract_formatted_data_test(formatted_publications,
                                             data_set_mention_info,
                                             pub_date_dict)

        print("Writing on new csv file...", end='')
        writer.writerows(output)
        print("DONE")
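# Usage sketch for both parsers. The directory layout below is hypothetical,
# and ./formatted-data/ must exist before either function runs, since both
# write their CSV output there.
train_set_parser(
    publication_txt_path_prefix='./data/input/files/text/',
    publications_json_path='./data/input/publications.json',
    data_set_citations_json_path='./data/input/data_set_citations.json',
    data_sets_json_path='./data/input/data_sets.json',
    output_filename='train_set.csv',
)
test_set_parser(
    publication_txt_path_prefix='./data/input/files/text/',
    publications_json_path='./data/input/publications.json',
    data_sets_json_path='./data/input/data_sets.json',
    output_filename='test_set.csv',
)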
def normalized_name(self):
    return normalize_string(self.name)
def normalized_name(self):
    return normalize_string(self.info["name"])
def cuevana_url(self):
    """Returns the link to this episode on cuevana."""
    name = self.normalized_name
    show = normalize_string(self.show)
    return urls.cuevana_url_show % (self.id, show, name)