Example #1
File: eval_all.py Project: xxx-git/ch_qa
def right_answer(test_ans, gold_ans_list):
    if test_ans in gold_ans_list:
        return True
    for gold_ans in gold_ans_list:
        if normalize_string(test_ans) in normalize_string(
                gold_ans) or normalize_string(gold_ans) in normalize_string(
                    test_ans):
            return True
    return False
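
For reference, a minimal way to exercise right_answer in isolation. The normalize_string below is only a stand-in (lower-case, drop non-word characters); the actual helper in eval_all.py is not shown in this excerpt and may differ.

import re

def normalize_string(s):
    # Stand-in normalizer for this sketch only.
    return re.sub(r'[^\w]+', '', s).lower()

print(right_answer("Beijing", ["beijing", "peking"]))   # True, via normalized containment
print(right_answer("Shanghai", ["beijing", "peking"]))  # False, no overlap either way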
Example #2
def normalize_vcard_contact(vcard_data: vobject.base.Component) -> ContactData:
    uid = "0"
    categories = ['uncategorized']

    if "fn" in vcard_data.contents:
        name = normalize_string(vcard_data.fn.value)
    else:
        raise RuntimeError("Contact has no name")

    if "categories" in vcard_data.contents:
        categories = vcard_data.categories.value

    contact = ContactData(uid, name, categories)

    return contact
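
A hedged usage sketch: it assumes ContactData is a simple (uid, name, categories) container and that normalize_string cleans up the display name; neither is defined in this excerpt.

import vobject

RAW_VCARD = (
    "BEGIN:VCARD\n"
    "VERSION:3.0\n"
    "FN:Ada Lovelace\n"
    "CATEGORIES:friends\n"
    "END:VCARD\n"
)

# readOne parses a single component from the raw vCard text.
vcard = vobject.readOne(RAW_VCARD)
contact = normalize_vcard_contact(vcard)  # name normalized, uid defaults to "0"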
Example #3
    def find_page_by_name(self, name: str, point=None, radius=None):
        """Find a single Wikipedia page given a name

        Search for the top 5 results. If there's a result with the exact same
        normalized name, choose that article. Otherwise, choose the first
        result.

        Note that even when the name is not exactly the same, the first result
        usually makes sense. For example, the first result for "Mount Rainier
        Wilderness" is "Mount Rainier National Park". And it would be ok to show
        the wikipedia page for the National Park when a user clicks on the
        wilderness polygon.

        Args:
            - name: name to search for
            - point: point in WGS84 to search around
            - radius: radius in meters to search around point
        """
        if not isinstance(name, str):
            raise TypeError('name must be str')
        if point is not None:
            if not isinstance(point, Point):
                raise TypeError('point must be of type Point')
        if radius is not None and radius > 10000:
            raise ValueError('max radius 10,000 meters')

        if point is None:
            res = wikipedia.search(name, results=5)
        else:
            lon, lat = list(point.coords)[0]
            radius = 400 if radius is None else radius
            res = wikipedia.geosearch(
                latitude=lat,
                longitude=lon,
                title=name,
                results=5,
                radius=radius)

        exact_match = [
            ind for ind, s in enumerate(res)
            if normalize_string(s) == normalize_string(name)
        ]
        choice = None
        if exact_match != []:
            choice = exact_match[0]
        else:
            choice = 0

        # Set auto_suggest=False here because the page name is known to exist.
        # Otherwise the page name sometimes redirects from a name that exists to
        # a name that does not. For example, searching
        # ```
        # wikipedia.page('Shasta-Trinity National Forest')
        # ```
        # raises `PageError: Page id "shasta trinity national forests" does not
        # match any pages. Try another id!`, while the page does exist:
        # https://en.wikipedia.org/wiki/Shasta%E2%80%93Trinity_National_Forest
        # See also
        # https://github.com/goldsmith/Wikipedia/issues/192
        # https://github.com/goldsmith/Wikipedia/issues/176
        return wikipedia.page(res[choice], auto_suggest=False)
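
The selection rule the docstring describes (prefer an exact normalized title, otherwise take the first hit) can be sketched directly against the wikipedia package; the normalize_string below is a stand-in that lower-cases and collapses whitespace, and the project's version may differ.

import wikipedia

def normalize_string(s):
    # Stand-in normalizer for this sketch only.
    return ' '.join(s.lower().split())

name = 'Mount Rainier Wilderness'
res = wikipedia.search(name, results=5)
exact = [i for i, s in enumerate(res) if normalize_string(s) == normalize_string(name)]
choice = exact[0] if exact else 0  # prefer an exact normalized match, else first result
page = wikipedia.page(res[choice], auto_suggest=False)
print(page.title, page.url)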
Example #4
def tensors_from_pair(input_lang, output_lang, pair):
    input_tensor = tensor_from_sentence(input_lang, pair[1])
    target_tensor = tensor_from_sentence(output_lang, pair[0])
    return (input_tensor, target_tensor)


if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    with open('input.pickle', 'rb') as f:
        input_lang = pickle.load(f)
    with open('target.pickle', 'rb') as f:
        target_lang = pickle.load(f)
    with open('../assets/SMSSpamCollection.txt') as f:
        lines = f.readlines()
        pairs = [[normalize_string(s) for s in line.split('\t')]
                 for line in lines]

    # Load the model
    hidden_size = 256
    model = RNN(input_lang.n_words, target_lang.n_words,
                hidden_size).to(device)
    param = torch.load("model_data/model4.pth")
    for p in model.parameters():
        print(p)
    model.load_state_dict(param)
    print("-" * 50)
    for p in model.parameters():
        print(p)

    input_tensor = tensor_from_sentence(input_lang, pairs[1][1]).to(device)
Example #5
def train_set_parser(publication_txt_path_prefix, publications_json_path,
                     data_set_citations_json_path, data_sets_json_path,
                     output_filename):
    citation_dict = dict()
    print("Loading data_set_citations.json file...")
    # open the publications.json file
    with open(data_set_citations_json_path) as json_data_set_citations:
        # parse it as JSON
        data_set_citations = json.load(json_data_set_citations)
        # loop
        for citation_info in tqdm(data_set_citations,
                                  total=len(data_set_citations)):
            publication_id = citation_info.get("publication_id", None)
            data_set_id = citation_info.get("data_set_id", None)
            mention_list = citation_info.get("mention_list", None)
            formatted_mention_list = []
            for mention in mention_list:
                mention = mention.encode('ascii', 'ignore').decode('ascii')  # drop non-ASCII characters
                mention = normalize_string(mention)
                sentences = split_into_sentences(mention)
                words = []
                for sentence in sentences:
                    words += word_tokenize(sentence)
                words = [w for w in words if len(w) < 15]
                if len(words) > 0:
                    formatted_mention_list.append(words)
            if publication_id in citation_dict:
                citation_dict[publication_id].append(
                    [data_set_id, formatted_mention_list])
            else:
                citation_dict[publication_id] = [[
                    data_set_id, formatted_mention_list
                ]]
    # set prefix to formatted publication txt files
    formatted_txt_path_prefix = "./formatted-data/"
    # set path to publications.json
    formatted_publications = dict()
    print("Tokenizing publication files...")
    # open the publications.json file
    with open(publications_json_path) as json_publication_file:
        # parse it as JSON
        publication_list = json.load(json_publication_file)
        # loop over the elements in the list
        for publication_info in tqdm(publication_list,
                                     total=len(publication_list)):
            # get information on publication:
            publication_id = publication_info.get("publication_id", None)
            text_file_name = publication_info.get("text_file_name", None)
            # get raw text
            raw_text = ''
            txt_file_path = publication_txt_path_prefix + text_file_name
            with open(txt_file_path) as txt_file:
                for line in txt_file:
                    stripped_line = line.strip()
                    raw_text += ' ' + stripped_line
                    if len(stripped_line.split()) <= 5:
                        raw_text += '<stop>'  # marking for sentence boundary in split_into_sentences() function
            raw_text = raw_text.encode('ascii', 'ignore').decode('ascii')  # drop non-ASCII characters
            raw_text = normalize_string(raw_text)
            # add to formatted_publications dictionary
            formatted_text_list = []
            sentences = split_into_sentences(raw_text)
            for sentence in sentences:
                words = word_tokenize(sentence)
                words = [w for w in words if len(w) < 15]
                if len(words) >= 10 and len(words) <= 30:
                    formatted_text_list.append(words)
            formatted_publications[publication_id] = formatted_text_list
    # tag mentions in publication text and write in csv file
    output_filepath = formatted_txt_path_prefix + output_filename
    with open(output_filepath, 'w') as csvfile:
        fieldnames = ['publication_id', 'sentence', 'label_sequence']
        writer = csv.DictWriter(csvfile,
                                fieldnames=fieldnames,
                                quoting=csv.QUOTE_ALL)
        writer.writeheader()
        print("Tagging dataset mentions in publications...")
        output = extract_formatted_data(formatted_publications, citation_dict)
        print("Writing on new csv file...", end='')
        writer.writerows(output)
        print("DONE")
Example #6
def test_set_parser(publication_txt_path_prefix, publications_json_path,
                    data_sets_json_path, output_filename):
    data_set_mention_info = []
    pub_date_dict = dict()
    print("Loading data_sets.json file...")
    with open(data_sets_json_path) as json_data_sets:
        data_sets = json.load(json_data_sets)
        for data_set_info in tqdm(data_sets, total=len(data_sets)):
            data_set_id = data_set_info.get("data_set_id", None)
            name = data_set_info.get("name", None)
            name = name.encode('ascii', 'ignore').decode('ascii')  # drop non-ASCII characters
            date = data_set_info.get("date", None)
            date = date.encode('ascii', 'ignore').decode('ascii')
            date = date[:10]
            if 'None' in date:
                date = '1800-01-01'
            date = int(date[:4]) * 12 * 31 + int(date[5:7]) * 31 + int(
                date[8:10])
            mention_list = data_set_info.get("mention_list", None)
            formatted_mention_list = []
            name = normalize_string(name)
            name_words = word_tokenize(name)
            formatted_mention_list.append(
                [name_words, list_to_string((name_words))])
            for mention in mention_list:
                mention = mention.encode('ascii', 'ignore').decode('ascii')
                mention = normalize_string(mention).strip()
                mention = re.sub(r"\s\s+", " ", mention)
                if all(c.islower()
                       for c in mention) and len(mention.split()) <= 2:
                    continue  # to avoid pronoun mentions like 'data', 'time'
                sentences = split_into_sentences(mention)
                words = []
                for sentence in sentences:
                    words += word_tokenize(sentence)
                words = [w for w in words if len(w) < 15]
                if len(words) > 0:
                    formatted_mention_list.append(
                        [words, list_to_string(words)])
            data_set_mention_info.append(
                [date, data_set_id, formatted_mention_list])
    data_set_mention_info.sort(key=lambda x: int(x[0]), reverse=True)
    # set prefix to formatted publication txt files
    formatted_txt_path_prefix = "./formatted-data/"
    # set path to publications.json
    formatted_publications = dict()
    print("Tokenizing publications.json file...")
    # open the publications.json file
    with open(publications_json_path) as json_publication_file:
        # parse it as JSON
        publication_list = json.load(json_publication_file)
        # loop over the elements in the list
        for publication_info in tqdm(publication_list,
                                     total=len(publication_list)):
            # get information on publication:
            publication_id = publication_info.get("publication_id", None)
            title = publication_info.get("title", None)
            title = title.encode('ascii', 'ignore').decode('ascii')
            text_file_name = publication_info.get("text_file_name", None)
            unique_identifier = publication_info.get(
                "unique_identifier", None)  # id가 bbk로 시작하면 pub_date은 None임
            if 'bbk' not in unique_identifier:
                pub_date = publication_info.get("pub_date", None)
            else:
                pub_date = '2200-01-01'
            pub_date = pub_date.encode('ascii', 'ignore').decode('ascii')
            pub_date = int(pub_date[:4]) * 12 * 31 + int(
                pub_date[5:7]) * 31 + int(pub_date[8:10])
            # get raw text
            raw_text = ''
            txt_file_path = publication_txt_path_prefix + text_file_name
            with open(txt_file_path) as txt_file:
                for line in txt_file:
                    stripped_line = line.strip()
                    raw_text += ' ' + stripped_line
                    if len(stripped_line.split()) <= 5:
                        raw_text += '<stop>'  # marking for sentence boundary in split_into_sentences() function
            raw_text = raw_text.encode('ascii', 'ignore').decode('ascii')
            raw_text = normalize_string(raw_text)
            # add to formatted_publications dictionary
            formatted_text_list = []
            chopped_raw_text = ''
            sentences = split_into_sentences(raw_text)
            for sentence in sentences:
                words = word_tokenize(sentence)
                words = [w for w in words if len(w) < 15]
                if len(words) >= 10 and len(words) <= 30:
                    formatted_text_list.append(words)
                    chopped_raw_text += ' ' + list_to_string(words)
            formatted_publications[publication_id] = [
                formatted_text_list,
                chopped_raw_text.strip()
            ]
            pub_date_dict[publication_id] = pub_date
    # tag mentions in publication text and write in csv file
    output_filepath = formatted_txt_path_prefix + output_filename
    with open(output_filepath, 'w') as csvfile:
        fieldnames = [
            'publication_id', 'sentence', 'label_sequence', 'labeled'
        ]
        writer = csv.DictWriter(csvfile,
                                fieldnames=fieldnames,
                                quoting=csv.QUOTE_ALL)
        writer.writeheader()
        print("Tagging pre-found dataset mentions in publications...")
        output = extract_formatted_data_test(formatted_publications,
                                             data_set_mention_info,
                                             pub_date_dict)
        print("Writing on new csv file...", end='')
        writer.writerows(output)
        print("DONE")
Example #7
File: api.py Project: jose2190/guicavane
    @property
    def normalized_name(self):
        return normalize_string(self.name)
Example #8
File: api.py Project: jose2190/guicavane
    @property
    def normalized_name(self):
        return normalize_string(self.info["name"])
Example #9
File: api.py Project: jose2190/guicavane
    def cuevana_url(self):
        """ Returns the link to this episode on cuevana. """

        name = self.normalized_name
        show = normalize_string(self.show)
        return urls.cuevana_url_show % (self.id, show, name)