Example #1
    def __init__(self,
                 vocab_file,
                 do_lower_case=True,
                 never_split=None,
                 additional_special_tokens=[
                     "[JJR]", "[JJS]", "[NNS]", "[NNP]", "[NNPS]", "[RBR]",
                     "[RBS]", "[VBD]", "[VBG]", "[VBN]", "[VBP]", "[VBZ]"
                 ],
                 **kwargs):
        self.inflection_tokens = additional_special_tokens
        self.tagger = PerceptronTagger()
        super().__init__(vocab_file,
                         do_lower_case=do_lower_case,
                         never_split=never_split,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        self.have_inflections = {'NOUN', 'ADJ', 'VERB'}
        self.lemma_tags = {'NN', 'VB', 'JJ', 'RB', 'MD', "NNP"}
        self.do_lower_case = do_lower_case
        if do_lower_case:
            self.cased_tokenizer = BasicTokenizer(do_lower_case=False,
                                                  never_split=never_split)
        else:
            self.cased_tokenizer = self.basic_tokenizer
Example #2
def format_subreddit(subreddit):
    tokenizer = BasicTokenizer(do_lower_case=True)
    with open(LOGS + 'vocabs/vocab_map.json', 'r') as infile:
        d = json.load(infile)
    vocab = set(d.keys())
    line_number = 0
    outfile = open(OUTPUT + subreddit, 'w')
    writer = csv.writer(outfile, delimiter=',')
    subreddit_file = ROOT + 'subreddits_month/' + subreddit + '/RC_sample'
    with open(subreddit_file, 'r') as infile:
        for line in infile:
            contents = line.strip()
            if contents.startswith('USER1USER0USER'):
                curr_user = contents
            else:
                sent_tok = sent_tokenize(contents)
                for sent in sent_tok:
                    tokens = tokenizer.tokenize(sent.strip())
                    for i, word in enumerate(tokens):
                        if word in vocab:
                            lh = ' '.join(tokens[:i])
                            rh = ' '.join(tokens[i + 1:])
                            writer.writerow(
                                [line_number, curr_user, lh, word, rh])
            line_number += 1
    outfile.close()
Example #3
def format_n_examples(word):
    with open(INPUT_LOGS + 'vocabs/vocab_map.json', 'r') as infile:
        d = json.load(infile)
    tokenizer = BasicTokenizer(do_lower_case=True)
    ID = d[word]
    doc = INPUT_LOGS + 'vocabs/docs/' + str(ID)
    curr_user = None
    line_number = 0
    outfile = open(OUTPUT + str(ID), 'w')
    writer = csv.writer(outfile, delimiter=',')
    with open(doc, 'r') as infile:
        for line in infile:
            contents = line.strip()
            if contents.startswith('USER1USER0USER'):
                curr_user = contents
            else:
                sent_tok = sent_tokenize(contents)
                for sent in sent_tok:
                    tokens = tokenizer.tokenize(sent.strip())
                    if word in tokens:
                        i = tokens.index(word)
                        lh = ' '.join(tokens[:i])
                        rh = ' '.join(tokens[i + 1:])
                        writer.writerow([line_number, curr_user, lh, word, rh])
                        break
            line_number += 1
    outfile.close()
Example #4
def get_sr2terms_no_mwes(): 
    tokenizer = BasicTokenizer(do_lower_case=True)
    sr2terms = defaultdict(list)
    num_mwes = 0
    with open(TERMS, 'r') as infile: 
        reader = csv.DictReader(infile, delimiter=',')
        for row in reader:
            term = row['term'].strip().lower()
            # ignore MWEs
            if len(tokenizer.tokenize(term)) > 1: 
                continue
            sr2terms[row['subreddit']].append(term)
    return sr2terms
Example #5
def get_vocab_word_instances(line, vocab=None):
    '''
    Used by sample_word_instances()
    to get a flatmap from each comment to (vocab word, [comment])
    '''
    tokenizer = BasicTokenizer(do_lower_case=True)
    line = line.strip()
    tokens = set(tokenizer.tokenize(line))
    ret = []
    union = tokens & set(vocab.keys())
    for w in union: 
        ret.append((w, [line]))
    if len(union) == 0: 
        ret.append((None, [line]))
    return ret 
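
A small illustrative sketch of what this flatmap emits, assuming the function above is in scope; `toy_vocab` is a made-up stand-in for the real vocab dict:

# Illustrative sketch only; `toy_vocab` is hypothetical.
toy_vocab = {'python': 0, 'bass': 1}
print(get_vocab_word_instances("I caught a bass today", vocab=toy_vocab))
# -> [('bass', ['I caught a bass today'])]
print(get_vocab_word_instances("hello there", vocab=toy_vocab))
# -> [(None, ['hello there'])]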
Example #6
def get_all_examples_with_word(word): 
    '''
    This function gathers all examples that contain a word, 
    to use as input to Amrami & Goldberg's model. 
    '''
    rdds = []
    for folder in os.listdir(SR_FOLDER_MONTH): 
        path = SR_FOLDER_MONTH + folder + '/RC_sample'
        data = sc.textFile(path) 
        data = data.filter(lambda line: not line.startswith('USER1USER0USER'))
        tokenizer = BasicTokenizer(do_lower_case=True)
        data = data.filter(lambda line: word in set(tokenizer.tokenize(line.strip())))
        rdds.append(data)
    all_occ = sc.union(rdds)
    all_occ.coalesce(1).saveAsTextFile(AMRAMI_INPUT + word)
Example #7
def sample_vocab_lines(tup): 
    '''
    This function samples 500 example comments for a given word. 
    It initially samples a larger number of comments, 
    then removes cases where the examples have many
    repetitions (such as comments written by bots). 
    
    Used by sample_word_instances()
    '''
    w = tup[0]
    lines = tup[1]
    sample_num = 25000
    if w == 'compose': # special case that occurs often
        sample_num = 300000
    instances = random.sample(lines, min(sample_num, len(lines)))
    tokenizer = BasicTokenizer(do_lower_case=True)
    comment2windows = {} # comment idx to window IDs
    windowIDs = {} # window list to window ID
    ID_counts = Counter() # window ID to count
    for i, inst in enumerate(instances): 
        lh, _, rh = inst.partition(w)
        ltokens = tokenizer.tokenize(lh)
        rtokens = tokenizer.tokenize(rh)
        ltokens = ltokens[-5:]
        rtokens = rtokens[:5]
        window = ltokens + [w] + rtokens
        if tuple(window) in windowIDs: 
            windowID = windowIDs[tuple(window)]
        else: 
            windowID = i 
            windowIDs[tuple(window)] = i
        comment2windows[i] = windowID 
        ID_counts[windowID] += 1
    new_instances = []
    for i, inst in enumerate(instances): 
        windowID = comment2windows[i]
        c = ID_counts[windowID]
        if c < 10: 
            new_instances.append(inst)
        if len(new_instances) == 500: break
    
    if len(new_instances) < 500 and len(lines) <= 20000: 
        print("Error: Not enough samples for word:", w)
        new_instances = []
    elif len(new_instances) < 500: 
        print("Error: Need to initially sample more comments for word:", w, len(lines), len(new_instances))
        new_instances = []
    return (w, new_instances)
Example #8
    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: PreTrainedTokenizer,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        args_parser: ArgumentHandler = TokenClassificationArgumentHandler(),
        device: int = -1,
        binary_output: bool = False,
        ignore_labels=["O"],
        task: str = "",
        ignore_special_tokens: bool = True,
        grouped_entities: bool = False,
        ignore_subwords: bool = False,
    ):
        super().__init__(
            model=model,
            tokenizer=tokenizer,
            modelcard=modelcard,
            framework=framework,
            args_parser=args_parser,
            device=device,
            binary_output=binary_output,
            task=task,
        )

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self._args_parser = args_parser
        self.ignore_labels = ignore_labels
        self.ignore_special_tokens = ignore_special_tokens
        self.grouped_entities = grouped_entities
        self.ignore_subwords = ignore_subwords
Example #9
    def _bert_basic(self, text):
        if self.word_tokenizer is None:
            from transformers import BasicTokenizer

            self.word_tokenizer = BasicTokenizer(**self.config)

        return self.word_tokenizer.tokenize(text)
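
For reference, a minimal standalone sketch of the BasicTokenizer behaviour these examples rely on (lowercasing, accent stripping, and splitting on whitespace and punctuation); expected outputs assume the standard transformers implementation:

from transformers import BasicTokenizer

# Minimal sketch of the tokenizer itself, independent of the example above.
tokenizer = BasicTokenizer(do_lower_case=True)
print(tokenizer.tokenize("Hello, World!"))  # expected: ['hello', ',', 'world', '!']
print(tokenizer.tokenize("Café au lait"))   # accents stripped: ['cafe', 'au', 'lait']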
Example #10
    def __init__(self,
                 data_path,
                 data_split,
                 vocab,
                 load_img=True,
                 img_dim=2048):
        self.vocab = vocab
        vocab_pre = pickle.load(
            open(os.path.join(data_path, 'vocab.pkl'), 'rb'))
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        #, additional_special_tokens=list(vocab_pre.word2idx.keys()))

        self.basic_tokenizer = BasicTokenizer()
        # captions
        self.captions = list()
        with open(os.path.join(data_path, f'{data_split}_caps.txt'), 'r') as f:
            for line in f:
                # TODO use bert tokenizer?????
                self.captions.append('[CLS] ' + line.strip().lower() +
                                     ' [SEP]')
        self.length = len(self.captions)

        # image features
        if load_img:
            self.images = np.load(
                os.path.join(data_path, f'{data_split}_ims.npy'))
        else:
            self.images = np.zeros((self.length // 5, img_dim))
        # each image can have 1 caption or 5 captions
        if self.images.shape[0] != self.length:
            self.im_div = 5
            assert self.images.shape[0] * 5 == self.length
        else:
            self.im_div = 1
Example #11
class MecabTokenizer(object):
    def __init__(self):
        self.mecab = MeCab.Tagger(
            f"--dicdir /usr/local/lib/mecab/dic/mecab-ko-dic")
        # Split punctuation & Tokenize Chinese Character & Clean Text
        self.basic_tokenizer = BasicTokenizer(do_lower_case=False,
                                              tokenize_chinese_chars=True)

    def tokenize(self, text: str):
        text = " ".join(self.basic_tokenizer.tokenize(text))
        text_ptr = 0
        is_first_token = True
        tokenized = []

        for morph in self.mecab.parse(text).split("\n"):
            if "\t" in morph:
                token = morph.split("\t")[0]

                # If space token, increment text_ptr
                if text[text_ptr] == " ":
                    while text[text_ptr] == " ":
                        text_ptr += 1
                    is_first_token = True  # Reset that it is first token

                text_ptr += len(token)

                if is_first_token:
                    is_first_token = False
                else:
                    token = "##" + token

                tokenized.append(token)
        return tokenized
Example #12
    def __init__(
            self,
            model_path: str,  # new
            tokenizer: PreTrainedTokenizer,
            modelcard: Optional[ModelCard] = None,
            framework: Optional[str] = None,
            args_parser: ArgumentHandler = TokenClassificationArgumentHandler(),
            device: int = -1,
            binary_output: bool = False,
            ignore_labels=["O"],
            task: str = "",
            grouped_entities: bool = False,
            ignore_subwords: bool = False,
            idx2label: List = None,  # new
    ):
        super().__init__(
            model_path=model_path,
            tokenizer=tokenizer,
            device=device,
            binary_output=binary_output,
        )

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self._args_parser = args_parser
        self.ignore_labels = ignore_labels
        self.grouped_entities = grouped_entities
        self.ignore_subwords = ignore_subwords
        self.idx2label = idx2label

        if self.ignore_subwords and not self.tokenizer.is_fast:
            raise ValueError(
                "Slow tokenizers cannot ignore subwords. Please set the `ignore_subwords` option"
                "to `False` or use a fast tokenizer.")
Example #13
def filter_ukwac(): 
    """
    Keep only the sentences in which a targeted lemma appears. 
    Format it as 
    lemma \t target word \t sentence
    for easy BERT processing :)
    
    ukwac is the training data for the SemEval 2013 WSI task. 
    """
    directory = ROOT + 'SemEval-2013-Task-13-test-data/contexts/xml-format/'
    target_set = set()
    for f in os.listdir(directory): 
        if f.endswith('.xml'): 
            target_set.add(f.replace('.xml', ''))
    data = sc.textFile(DATA + 'ukwac_preproc')
    data = data.filter(lambda line: not line.startswith("CURRENT URL "))
    tokenizer = BasicTokenizer(do_lower_case=True) 
    data = data.flatMap(partial(sentences_with_target_words, tokenizer=tokenizer, 
        target_set=target_set))
    data = data.sample(False,0.05,0)
    counts = data.map(count_ukwac_lemmas) 
    counts = counts.reduceByKey(lambda n1, n2: n1 + n2)
    counts = counts.collectAsMap()
    data = data.collect()
    with open(LOGS + 'ukwac2.txt', 'w') as outfile: 
        for item in data: 
            outfile.write(item[0] + '\t' + item[1] + '\t' + item[2] + '\n')
    for lemma in counts: 
        print(lemma, counts[lemma])
Example #14
def build_from_p_e_m_file(p_e_m_file, dump_db_file, wiki_mention_db_file,
                          **kwargs):
    dump_db = DumpDB(dump_db_file)
    tokenizer = BasicTokenizer(do_lower_case=False)
    normalizer = BertLowercaseNormalizer()
    wiki_mention_db = MentionDB(wiki_mention_db_file)
    MentionDB.build_from_p_e_m_file(p_e_m_file, dump_db, wiki_mention_db,
                                    tokenizer, normalizer, **kwargs)
Example #15
class CustomTokenizer(object):
    def __init__(self, vocab):
        self.tokenizer = BasicTokenizer(do_lower_case=False)
        self.vocab = vocab

    def __call__(self, text):
        words = self.tokenizer.tokenize(text.strip())
        return Doc(self.vocab, words=words)
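
A usage sketch, mirroring Example #26 below: the wrapper replaces spaCy's default tokenizer so that token texts match BasicTokenizer output (assumes spaCy and the en_core_web_sm model are installed):

import spacy

# Sketch only: plug the custom tokenizer into a spaCy pipeline.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
nlp.tokenizer = CustomTokenizer(nlp.vocab)
doc = nlp("Steve Smith's dog")
print([t.text for t in doc])  # punctuation split off: ['Steve', 'Smith', "'", 's', 'dog']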
Example #16
def main():
    filename = sys.argv[1]
    outpath = filename.replace('subreddits_month', 'subreddits3') + '.conll'
    outfile = open(outpath, 'w')
    tokenizer = BasicTokenizer(do_lower_case=True)
    vocab = set()
    with open(filename, 'r') as infile:
        for line in infile:
            if line.startswith('USER1USER0USER'):
                if len(vocab) > 0:
                    outfile.write('\n'.join(list(vocab)) + '\n')
                vocab = set()
            else:
                tokens = tokenizer.tokenize(line.strip())
                vocab.update(tokens)
        if len(vocab) > 0:
            outfile.write('\n'.join(list(vocab)) + '\n')
    outfile.close()
Example #17
def basic_stats(): 
    '''
    Number of subreddits, number of terms per subreddit
    
    Check that all subreddits with links have glossary standardized terms
    and that subreddit names match up in these files. 
    '''
    tokenizer = BasicTokenizer(do_lower_case=True)
    sr_list = []
    with open(SR_LIST, 'r') as infile: 
        reader = csv.DictReader(infile, delimiter=',')
        for row in reader:
            if row['glossary'].strip() != '': 
                sr_list.append(row['subreddit_name'])
    sr_list = sorted(sr_list)
    sr_set = set()
    sr_terms = defaultdict(set)
    mwe_set = defaultdict(set)
    total_terms = 0
    num_mwes = 0
    with open(TERMS, 'r') as infile: 
        reader = csv.DictReader(infile, delimiter=',')
        for row in reader:
            sr_set.add(row['subreddit'].strip())
            term = row['term'].strip().lower()
            sr_terms[row['subreddit']].add(term)
            term_len = len(tokenizer.tokenize(term))
            if term_len > 1: 
                num_mwes += 1
                mwe_set[term_len].add(term)
    assert sorted(sr_list) == sorted(sr_set)
    sr_count = Counter()
    for sr in sr_terms: sr_count[sr] = len(sr_terms[sr])
    mwe_count = Counter()
    for term_len in mwe_set: mwe_count[term_len] = len(mwe_set[term_len])
    print("Number of subreddits:", len(sr_set))
    print("Average number of terms per subreddit:", np.mean(list(sr_count.values())))
    print("Min number of terms:", np.min(list(sr_count.values())))
    print("Max number of terms:", np.max(list(sr_count.values())))
    print("Total number of terms:", sum(list(sr_count.values())))
    print("Total number of non-unique mwes:", num_mwes)
    print("Total number of unique mwes:", sum(list(mwe_count.values())))
    for term_len in sorted(mwe_count.keys()): 
        print(term_len, mwe_count[term_len])
Example #18
def tokenizer_check(): 
    '''
    This function was used to compare two tokenizers. 
    
    The main conclusion after Lucy ran this function was that
    the BasicTokenizer behaves like the BertTokenizer with its
    wordpieces joined back together. 
    '''
    path = SR_FOLDER_MONTH + 'askreddit/RC_sample'
    data = sc.textFile(path)
    sample = data.takeSample(False, 100) 
    tokenizer1 = BasicTokenizer(do_lower_case=True)
    tokenizer2 = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    success = True
    for s in sample: 
        tokens1 = tokenizer1.tokenize(s)
        tokens2 = tokenizer2.tokenize(s)
        prev_word = None
        tokens3 = []
        ongoing_word = []
        for w in tokens2: 
            if w.startswith('##'): 
                if not prev_word.startswith('##'): 
                    ongoing_word.append(prev_word)
                ongoing_word.append(w[2:])
            else: 
                if len(ongoing_word) == 0 and prev_word is not None: 
                    tokens3.append(prev_word)
                elif prev_word is not None:
                    tokens3.append(''.join(ongoing_word))
                ongoing_word = []
            prev_word = w
        if len(ongoing_word) == 0 and prev_word is not None: 
            tokens3.append(prev_word)
        elif prev_word is not None: 
            tokens3.append(''.join(ongoing_word))
        if tokens3 != tokens1: 
            print("OH NOOOOOOOOOO")
            print(tokens1)
            print(tokens3) 
            success = False
    if success: 
        print("TOKENS MATCHED UP!")
Example #19
class BertLowercaseNormalizer(object):
    def __init__(self,
                 never_lowercase=("[UNK]", "[SEP]", "[PAD]", "[CLS]",
                                  "[MASK]")):
        self._tokenizer = BasicTokenizer()
        self._never_lowercase = frozenset(never_lowercase)

    def normalize(self, token):
        if token not in self._never_lowercase:
            token = token.lower()
            token = self._tokenizer._run_strip_accents(token)
        return token
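
A quick sketch of the normalizer's behaviour, assuming the class above is in scope: tokens outside never_lowercase are lowercased and accent-stripped via the tokenizer's private _run_strip_accents helper, while special tokens pass through unchanged:

# Illustrative sketch of BertLowercaseNormalizer.normalize.
normalizer = BertLowercaseNormalizer()
print(normalizer.normalize("Café"))   # -> 'cafe'
print(normalizer.normalize("[CLS]"))  # -> '[CLS]' (left untouched)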
Example #20
    def __init__(self, from_pretrained=None):
        fin = None
        self.str2idx = {" " : 0, "<s>" : 1, "</s>" : 2, "<sep>": 3, "<mask>": 4, "<pad>": 5}
        self.idx2str = ["", "<s>", "</s>", "<sep>", "<mask>", "<pad>"]
        if from_pretrained is not None:
            try:
                fin = open(from_pretrained, "rb")
            except:
                fin = None

            if fin is not None:
                self.str2idx, self.idx2str = pickle.load(fin)
                fin.close()
            else:
                print("Warning: pretrained file at location \"%s\" not found." % from_pretrained)

        self.basic_tokenizer = BasicTokenizer(do_lower_case=False, never_split=["<mask>", "</s>", "<EOT>", "<EOL>", "<sep>", "<mask>", "<pad>"])
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.sep_token_id = 3
        self.mask_token_id = 4
        self.pad_token_id = 5
Example #21
def main():
    parser = argparse.ArgumentParser(
        description=\
            'Formats a Wikipedia dump (.xml.bz2 archive).'
    )
    parser.add_argument(
        '--archive_path',
        required=True,
        type=str,
        help='Path to Wikipedia dump to format.',
    )
    parser.add_argument(
        '--delete_document_level_corpus',
        action='store_true',
        help=
        'Whether to delete the original document-level corpus after formatting.',
    )
    args = parser.parse_args()

    formatter = WikipediaCorpusFormatter(archive_path=args.archive_path)
    logging.info('Preparing to format Wikipedia dump using parameters:')
    logging.info(' * archive_path: %s', args.archive_path)

    # NOTE: this is an extraction step (dump -> single .txt file with documents)
    fpath = formatter.format_as_one_document_per_line()
    if fpath is None:
        return  # Aborting formatting as formatted corpus already exists

    # NOTE: this is the actual formatting step:
    # --> one tokenized sentence per line + blank line between documents
    logging.info('Formatting extracted documents...')
    tokenizer = BasicTokenizer()
    split_into_sentences = nltk.tokenize.sent_tokenize
    split_into_tokens = tokenizer.tokenize
    input_file_path = fpath
    output_file_path = fpath.replace('.txt', '.formatted.txt')
    with open(input_file_path, "r", encoding="utf-8") as input_file:
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            for line in tqdm(input_file, desc='Segmenting corpus'):
                if line.strip():  # if document
                    sentences = split_into_sentences(line)
                    for sentence in sentences:
                        tokens = split_into_tokens(sentence.strip())
                        new_line = ' '.join(tokens) + '\n'
                        output_file.write(new_line)
                else:  # if blank line
                    output_file.write('\n')

    if args.delete_document_level_corpus:
        logging.info('Deleting document level corpus...')
        os.remove(input_file_path)
Example #22
def load_classification_dataset(step, do_lower_case):
    """ Loads classification exampels from a dataset. """
    assert step in ['train', 'test']
    basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    path = os.path.join(DATA_PATH, 'classification', f'{step}.txt')
    examples = []
    with open(path, 'r', encoding='utf-8') as data_file:
        lines = data_file.readlines()
        for i, line in tqdm(enumerate(lines),
                            desc=f'reading `{os.path.basename(path)}`...'):
            # example: __label__negative I don't like tomatoes.
            splitline = line.strip().split()
            label = splitline[0].split('__label__')[-1]
            tokens = ' '.join(splitline[1:])
            examples.append(
                ClassificationExample(
                    id=i,
                    tokens_a=basic_tokenizer.tokenize(tokens),
                    tokens_b=None,
                    label=label,
                ))
    logging.info('Number of `%s` examples: %d', step, len(examples))
    return examples
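
A hedged sketch of how a single line is parsed, using the fastText-style example from the code comment above:

from transformers import BasicTokenizer

# Sketch of the per-line parsing done inside load_classification_dataset.
line = "__label__negative I don't like tomatoes."
splitline = line.strip().split()
label = splitline[0].split('__label__')[-1]  # 'negative'
tokens_a = BasicTokenizer(do_lower_case=True).tokenize(' '.join(splitline[1:]))
print(label, tokens_a)  # negative ['i', 'don', "'", 't', 'like', 'tomatoes', '.']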
Example #23
def est_finetuning_gloss_cov(): 
    """
    For words in subreddit glossaries, calculate how many times
    they appear in the finetuning input. 
    
    This was used during domain adaptation to check that glossary words
    are in fact being seen by the model. 
    
    Our paper does not include the domain adapted model. 
    """
    data = sc.textFile(LOGS + 'finetune_input_train2')
    sr2terms = get_sr2terms()
    terms = set()
    for sr in sr2terms: 
        terms.update(sr2terms[sr])
    tokenizer = BasicTokenizer(do_lower_case=True) 
    data = data.flatMap(lambda line: list(set(tokenizer.tokenize(line)) & terms))
    data = data.map(lambda w: (w, 1))
    data = data.reduceByKey(lambda n1, n2: n1 + n2)
    data = data.collectAsMap()
    gloss_examples = Counter(data)
    avg = sum(gloss_examples.values()) / len(gloss_examples)
    print("------RESULT median # of examples per gloss word:", np.median(list(gloss_examples.values())))
    print("------RESULT avg # of examples per gloss word:", avg)
Example #24
def load_sequence_labelling_dataset(step, do_lower_case):
    """ Loads sequence labelling examples from a dataset. """
    assert step in ['train', 'test']
    path = os.path.join(DATA_PATH, 'sequence_labelling', f'{step}.txt')
    i = 0
    examples = []
    with open(path, 'r', encoding='utf-8') as data_file:
        lines = data_file.readlines()
        token_sequence = []
        label_sequence = []
        for line in tqdm(lines, desc=f'reading `{os.path.basename(path)}`...'):
            # example:
            #          My O
            #          name O
            #          is O
            #          Hicham B-PER
            #          . O
            splitline = line.strip().split()
            if splitline:
                token, label = splitline
                token_sequence.append(token)
                label_sequence.append(label)
            else:
                examples.append(
                    SequenceLabellingExample(
                        id=i,
                        token_sequence=token_sequence,
                        label_sequence=label_sequence,
                    ))
                i += 1
                token_sequence = []
                label_sequence = []

    # Don't forget to add the last example
    if token_sequence:
        examples.append(
            SequenceLabellingExample(
                id=i,
                token_sequence=token_sequence,
                label_sequence=label_sequence,
            ))

    retokenize(examples,
               tokenization_function=BasicTokenizer(
                   do_lower_case=do_lower_case).tokenize)
    logging.info('Number of `%s` examples: %d', step, len(examples))
    return examples
Example #25
    def __init__(self,
                 model,
                 tokenizer: PreTrainedTokenizer = None,
                 modelcard: ModelCard = None,
                 framework: Optional[str] = None,
                 args_parser: ArgumentHandler = None,
                 device: int = -1,
                 binary_output: bool = False,
                 ignore_labels=['O']):
        super().__init__(model=model,
                         tokenizer=tokenizer,
                         modelcard=modelcard,
                         framework=framework,
                         args_parser=args_parser,
                         device=device,
                         binary_output=binary_output)

        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self.ignore_labels = ignore_labels
Example #26
def main():
    """
    Undirected, unweighted graph with parameters from original Textrank paper
    """
    tokenizer = BasicTokenizer(do_lower_case=True)
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    nlp.tokenizer = CustomTokenizer(nlp.vocab)
    article = sys.argv[1]

    print("creating graph for", article)
    G = nx.Graph()

    infile = open(INFOLDER + article + '/RC_sample', 'r')
    for line in infile:
        if line.startswith('USER1USER0USER'): continue
        doc = nlp(line)
        comment_toks = []
        for token in doc:
            if token.pos_ in set(['ADJ', 'NOUN', 'PROPN']):
                comment_toks.append(token.text.lower())

        # add edges to graph using window size of 2
        for i in range(len(comment_toks) - 1):
            w1 = comment_toks[i]
            w2 = comment_toks[i + 1]
            if not G.has_edge(w1, w2):
                G.add_edge(w1, w2)
    infile.close()
    pagerank_scores = nx.pagerank(G, alpha=0.85, tol=0.0001)
    outfile = open(OUTFOLDER + article, 'w')
    writer = csv.writer(outfile)
    writer.writerow(['word', 'textrank'])
    vals = sorted(pagerank_scores.items(), key=lambda item: item[1])
    for tup in vals:
        writer.writerow([tup[0], tup[1]])
    outfile.close()
Example #27
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""
    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            print("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            print("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
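
A worked usage sketch, assuming the function above and its imports are in scope; it reproduces the "steve smith" scenario discussed in the comments of Example #28 below:

# Sketch: project a normalized prediction back onto the original text.
pred_text = "steve smith"
orig_text = "Steve Smith's"
print(get_final_text(pred_text, orig_text, do_lower_case=True))  # -> 'Steve Smith'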
Example #28
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        # ns_text: the original text with all spaces removed; ns_to_s_map: maps character positions in ns_text back to positions in the original text
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    # Basic tokenizer that splits on whitespace and punctuation
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    # Search for the predicted answer inside the tokenized original text
    start_position = tok_text.find(pred_text)
    if start_position == -1:  # if not found, return the original text directly
        if verbose_logging:
            print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1
    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(
            tok_ns_text):  # if the space-stripped original and re-tokenized texts differ in length, return the original text
        if verbose_logging:
            logging.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            print("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            print("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
Example #29
    def __init__(self,
                 vocab_file,
                 do_lower_case=False,
                 do_word_tokenize=True,
                 do_subword_tokenize=True,
                 word_tokenizer_type="basic",
                 subword_tokenizer_type="wordpiece",
                 never_split=None,
                 unk_token="[UNK]",
                 sep_token="[SEP]",
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 mecab_kwargs=None,
                 **kwargs):
        """Constructs a MecabBertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_word_tokenize=True.
            **do_word_tokenize**: (`optional`) boolean (default True)
                Whether to do word tokenization.
            **do_subword_tokenize**: (`optional`) boolean (default True)
                Whether to do subword tokenization.
            **word_tokenizer_type**: (`optional`) string (default "basic")
                Type of word tokenizer.
            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
                Type of subword tokenizer.
            **mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer` constructor (default None)
        """
        super(BertTokenizer, self).__init__(
            unk_token='<unk>' if word_tokenizer_type == 'sp' else '[UNK]',
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            do_lower_case=do_lower_case,
            do_word_tokenize=do_word_tokenize,
            do_subword_tokenize=do_subword_tokenize,
            word_tokenizer_type=word_tokenizer_type,
            subword_tokenizer_type=subword_tokenizer_type,
            never_split=never_split,
            mecab_kwargs=mecab_kwargs,
            **kwargs,
        )
        # ^^ We call the grandparent's init, not the parent's.

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([
            (ids, tok) for tok, ids in self.vocab.items()
        ])

        self.do_word_tokenize = do_word_tokenize
        self.word_tokenizer_type = word_tokenizer_type
        self.lower_case = do_lower_case
        self.never_split = never_split
        self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
        if do_word_tokenize:
            if word_tokenizer_type == "basic":
                self.word_tokenizer = BasicTokenizer(
                    do_lower_case=do_lower_case,
                    never_split=never_split,
                    tokenize_chinese_chars=False)
            elif word_tokenizer_type == "mecab":
                self.word_tokenizer = MecabTokenizer(
                    do_lower_case=do_lower_case,
                    never_split=never_split,
                    **(mecab_kwargs or {}))
            elif word_tokenizer_type == "sp":
                path_vocab = Path(vocab_file)
                self.word_tokenizer = SentencePiecepTokenizer(
                    model_file=str(path_vocab.parent / path_vocab.stem) +
                    '.model',
                    do_lower_case=do_lower_case,
                    never_split=never_split,
                    **(mecab_kwargs or {}))
            else:
                raise ValueError(
                    "Invalid word_tokenizer_type '{}' is specified.".format(
                        word_tokenizer_type))

        self.do_subword_tokenize = do_subword_tokenize
        self.subword_tokenizer_type = subword_tokenizer_type
        if do_subword_tokenize:
            if subword_tokenizer_type == "wordpiece":
                self.subword_tokenizer = WordpieceTokenizer(
                    vocab=self.vocab, unk_token=self.unk_token)
            elif subword_tokenizer_type == "character":
                self.subword_tokenizer = CharacterTokenizer(
                    vocab=self.vocab, unk_token=self.unk_token)
            else:
                raise ValueError(
                    "Invalid subword_tokenizer_type '{}' is specified.".format(
                        subword_tokenizer_type))
Example #30
def main():
    """Script for formatting document-level corpora."""

    parser = argparse.ArgumentParser(
        description=\
            'Formats a document-level corpus.'
    )
    parser.add_argument(
        '--document_level_corpus_path',
        required=True, type=str,
        help=\
            'Path to the document level corpus: one document per line '
            '+ blank line between documents.',
    )
    args = parser.parse_args()

    logging.info(
        'Preparing to format a document-level corpus using parameters:')
    for argname, argvalue in vars(args).items():
        logging.info(' * %s: %s', argname, argvalue)

    # Make sure document-level corpus exists
    assert os.path.exists(args.document_level_corpus_path)

    # Make output directory
    formatted_corpus_path = os.path.join(
        FORMATTED_DATA_DIRECTORY,
        os.path.basename(os.path.dirname(args.document_level_corpus_path)),
        'formatted.txt')
    os.makedirs(os.path.dirname(formatted_corpus_path), exist_ok=True)

    # Make sure the output corpus does not already exist
    if os.path.exists(formatted_corpus_path):
        logging.warning('Found corpus file: %s',
                        formatted_corpus_path.replace(WORKDIR, '$WORKDIR'))
        logging.warning('Aborted formatting.')
        return

    # Tokenizer & sentence segmenter
    tokenizer = BasicTokenizer()
    split_into_sentences = nltk.tokenize.sent_tokenize
    logging.info('Using NLTK sentence segmenter.')
    split_into_tokens = tokenizer.tokenize
    logging.info('Using huggingface/transformers BasicTokenizer.')

    # Actual formatting
    logging.info('Formatting corpus...')
    input_file_path = args.document_level_corpus_path
    output_file_path = formatted_corpus_path
    n_tokens = 0
    n_sentences = 0
    with open(input_file_path, "r", encoding="utf-8") as input_file:
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            for line in tqdm(input_file, desc='Segmenting corpus'):
                if line.strip():  # if document
                    sentences = split_into_sentences(line)
                    for sentence in sentences:
                        tokens = split_into_tokens(sentence.strip())
                        new_line = ' '.join(tokens) + '\n'
                        output_file.write(new_line)
                        n_sentences += 1
                        n_tokens += len(tokens)
                else:  # if blank line
                    output_file.write('\n')

    logging.info('Done formatting.')
    logging.info('* Total number of sentences: %s', n_sentences)
    logging.info('* Total number of tokens: %s', n_tokens)