Example #1
def preprocess_text(list_text):
    '''
    Preprocess a list of documents: lower-case each one, replace the symbols in
    replace_with_space with spaces, drop stopwords, lemmatize, and strip any
    remaining punctuation. Returns a 2-d list of token lists.
    '''

    # regex patterns and stopword set used by preprocess_text
    replace_with_space = re.compile('[/(){}\[\]\|@,;]')
    symbols_to_remove = re.compile("[^a-z _]+")
    stop_words = set(stopwords.words('english'))
    added_stopwords = ['one', 'says', 'like', 'said', 'say', 'would', 'go']
    stop_words = set(list(stop_words) + added_stopwords)

    #list where preprocessed text will be stored.
    preprocessed_text = []
    tknzr = TreebankWordTokenizer()
    lmtzr = WordNetLemmatizer()
    #stemmer = PorterStemmer()
    for sentence in list_text:
        text = sentence.lower()
        text = re.sub(replace_with_space, " ", text)
        text_tokens = tknzr.tokenize(text)
        text_tokens = [
            token for token in text_tokens if token not in stop_words
        ]
        text_tokens = [lmtzr.lemmatize(token) for token in text_tokens]
        text = nltk.tokenize.treebank.TreebankWordDetokenizer().detokenize(
            text_tokens)
        text = re.sub(symbols_to_remove, "", text)
        text_tokens = tknzr.tokenize(text)
        preprocessed_text.append(text_tokens)

    return preprocessed_text
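A minimal usage sketch for the function above, assuming the NLTK imports shown here and that the stopwords/wordnet corpora have been downloaded (the sample documents are made up):

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer

nltk.download('stopwords')   # corpora used by preprocess_text
nltk.download('wordnet')

docs = ["The cats are sitting on the mats!",
        "One would say it goes like this."]
print(preprocess_text(docs))   # a 2-d list such as [['cat', 'sitting', 'mat'], [...]]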
Example #2
    def __init__(self, rules=True):

        self.punct = set(string.punctuation).difference(set('%='))

        self.rules = rules

        self.splitters = re.compile("[-/.,|<>]")
        self.tokenizer = TreebankWordTokenizer()
Example #3
def tokenize(line):
    global tokenizer
    if args.skip_tokenization:
        return line
    if args.ptb:
        if tokenizer is None:
            tokenizer = TreebankWordTokenizer()
        return tokenizer.tokenize(line, convert_parentheses=True)
    return word_tokenize(line, language=args.language)
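For reference, convert_parentheses=True makes the Treebank tokenizer emit PTB-style bracket tokens; a small standalone sketch, independent of the args object used above:

from nltk.tokenize import TreebankWordTokenizer

print(TreebankWordTokenizer().tokenize("A (small) test.", convert_parentheses=True))
# ['A', '-LRB-', 'small', '-RRB-', 'test', '.']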
Example #4
 def __init__(self):
     self.letters_mappings = { u"á" : "a", 
                               u"é" : "e", 
                               u"í" : "i",
                               u"ó" : "o",
                               u"ú" : "u",
                               u"ñ" : "n",
                               u"ü" : "u" }
     self.tokenizer = TreebankWordTokenizer()
Example #5
  def test_word_tokenize_quotes(self):
    text = '"сл"'
    tokenizer = TreebankWordTokenizer()
    # _spans = nltk.word_tokenize(text)
    _spans = tokenizer.tokenize(text)

    spans = [s for s in _spans]
    print("".join(spans))
    for c in spans:
      print(len(c))
    self.assertEqual(3, len(spans))
Example #6
    def __init__(self):
        """Constructor."""
        super().__init__()

        self.__treebank_tokenizer = TreebankWordTokenizer()

        hunspell_dict_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'hindi-hunspell',
            'dict-hi_IN',
        )
        if not os.path.isdir(hunspell_dict_dir):
            raise McLanguageException(
                "Hunspell dictionary directory does not exist at path: %s." %
                hunspell_dict_dir)

        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.dic')):
            raise McLanguageException(
                "Hunspell dictionary file does not exist at path: %s" %
                hunspell_dict_dir)
        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.aff')):
            raise McLanguageException(
                "Hunspell affix file does not exist at path: %s" %
                hunspell_dict_dir)

        try:
            self.__hindi_hunspell = Hunspell(
                lang='hi_IN', hunspell_data_dir=hunspell_dict_dir)
        except Exception as ex:
            raise McLanguageException(
                "Unable to initialize Hunspell with data directory '%s': %s" %
                (
                    hunspell_dict_dir,
                    str(ex),
                ))

        # Quick self-test to make sure that Hunspell is installed and dictionary is available
        hunspell_exc_message = """
            Hunspell self-test failed; make sure that Hunspell is installed and dictionaries are accessible, e.g.
            you might need to fetch Git submodules by running:

                git submodule update --init --recursive
        """
        try:
            test_stems = self.stem_words(['गुरुओं'])
        except Exception as _:
            raise McLanguageException(hunspell_exc_message)
        else:
            if len(test_stems) == 0 or test_stems[0] != 'गुरु':
                raise McLanguageException(hunspell_exc_message)
Example #7
def get_tokenizer(params: dict):
    model = params.get('tokenizer', '').lower()
    if model == 'punkt':
        return WordPunctTokenizer()
    if model != '' and model != 'treebank':
        raise ModuleNotFoundError(f'No such tokenizer {model}!')
    return TreebankWordTokenizer()
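A quick sketch of how this factory might be called. Note that the 'punkt' option here returns NLTK's WordPunctTokenizer (a word-level tokenizer), not the Punkt sentence tokenizer; the imports are the ones the snippet itself relies on:

from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer

print(get_tokenizer({'tokenizer': 'treebank'}))   # TreebankWordTokenizer instance
print(get_tokenizer({'tokenizer': 'punkt'}))      # WordPunctTokenizer instance
print(get_tokenizer({}))                          # defaults to TreebankWordTokenizer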
Example #8
class Tokenizer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def tokenize(self, sentence):
        tokens = self.tokenizer.tokenize(sentence)
        return tokens
Example #9
def normalize(text):
    text = text.decode('utf-8')
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)  # strip URLs
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    text = strip_accents_ascii(text)
    text = text.encode('utf-8')
    text = ' '.join(map(lambda x: x.lower(), TreebankWordTokenizer().tokenize(text)))
    return text
Example #10
def iconize_corpus(args):
    """
    This script retrieves the sentences that contain
    at least one icon term.
    - fdata is the current corpus
    - fembed is the icon embedding file
    """
    # load embedding terms
    embdwrds = defaultdict(str)
    embdsyns = defaultdict(str)
    with open(args.fembd, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            terms = line.split()
            term = terms[0]
            code = terms[1]
            wtype = terms[2]
            if wtype == "main":
                embdwrds[term] = code
            else:
                embdsyns[term] = code

    # filter sentences that are oov w.r.t. the embedding
    tbt = TreebankWordTokenizer()
    plist = ["..", "...", "``", "''", "."]
    with open(args.fdata, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            sen = line.lower()
            sen = ''.join(i for i in sen if ord(i) < 123)
            sen = tbt.tokenize(sen)
            sen = [x for x in sen if x not in string.punctuation]
            sen = [x for x in sen if x not in plist]
            sentence = []
            for word in sen:
                code = embdwrds[word]
                if code != "":
                    sentence.append(code)
                elif embdsyns[word] != "":
                    code = embdsyns[word]
                    sentence.append(code)
                else:  # comment for pure icon mode
                    sentence.append(word)  # pure icon mode
            sentence = str.join(" ", sentence)
            print(sentence)
Example #11
    def __init__(self, start_token: str, end_token: str, unk_token: str, num_words: int = None, max_seq_len: int = 100):
        self.treebank_word_tokenizer = TreebankWordTokenizer()
        improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
        improved_close_quote_regex = re.compile(u'([»”’])', re.U)
        improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
        self.treebank_word_tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
        self.treebank_word_tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
        self.treebank_word_tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))

        self.word_counts = OrderedDict()
        self.word_docs = {}
        self.num_words = num_words
        self.document_count = 0

        self.START_TOKEN = start_token
        self.END_TOKEN = end_token
        self.UNK_TOKEN = unk_token
        self.MAX_SEQ_LEN = max_seq_len
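Because STARTING_QUOTES, ENDING_QUOTES and PUNCTUATION are defined as class-level lists on TreebankWordTokenizer, the insertions above affect every instance. A rough standalone sketch of just the quote handling (the sample text is made up):

import re
from nltk.tokenize import TreebankWordTokenizer

tok = TreebankWordTokenizer()
tok.STARTING_QUOTES.insert(0, (re.compile(u'([«“‘])', re.U), r' \1 '))
tok.ENDING_QUOTES.insert(0, (re.compile(u'([»”’])', re.U), r' \1 '))
print(tok.tokenize(u'«Bonjour», dit-elle.'))
# the guillemets/curly quotes now come out as separate tokens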
Example #12
def tree_bank_tokenizer():
    tokenizer = TreebankWordTokenizer()
    tokenizer.PUNCTUATION.append((re.compile(r'[/\-]'), r' \g<0> '))
    tokenizer.PUNCTUATION.append((re.compile(r'\.\.'), r' .. '))
    tokenizer.PUNCTUATION.append((re.compile(r'[\.,\+]'), r' \g<0> '))
    tokenizer.STARTING_QUOTES.append((re.compile(
        r"(')(?![sS]\s|[mM]\s|[dD]\s|ll\s|LL\s|re\s|RE\s|ve\s|VE\s|t\s|T\s|\s)"
    ), r" \1 "))
    return tokenizer
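A short sketch of what the extra rules do, assuming the `re` and nltk.tokenize.TreebankWordTokenizer imports the snippet relies on (the sample string is made up):

tok = tree_bank_tokenizer()
print(tok.tokenize("state-of-the-art 3/4"))
# hyphens and slashes are split into separate tokens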
Example #13
 def to_lower(self, item):
     tokenizer = TreebankWordTokenizer()
     for field in self.class_properties:
         current_field_value = getattr(item, field)
         setattr(item, field, [
             w.lower()
             for w in self.tokenizer_text(current_field_value, tokenizer)
         ])
     return item
Example #14
def tokenize(s: str) -> list:
    """
    Tokenize the given text using TreebankWordTokenizer delivered along with NLTK
    :param s: text
    :return: list of tokens
    """
    from nltk import TreebankWordTokenizer

    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(s)
    result = []
    for word in tokens:
        # the last "decode" function is because of Python3
        # http://stackoverflow.com/questions/2592764/what-does-a-b-prefix-before-a-python-string-mean
        w = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('utf-8').strip()
        # and add only if not empty (it happened in some data that there were empty tokens...)
        if w:
            result.append(w)

    return result
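A small usage sketch; the `unicodedata` import is assumed to live at module level in the original file, and the sample string is made up:

import unicodedata

print(tokenize("Café déjà-vu, naïve résumé."))
# e.g. accents are folded away: ['Cafe', 'deja-vu', ',', 'naive', 'resume', '.']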
Example #15
class TokenizePreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, rules=True):

        self.punct = set(string.punctuation).difference(set('%='))

        self.rules = rules

        self.splitters = re.compile("[-/.,|<>]")
        self.tokenizer = TreebankWordTokenizer()

    def fit(self, X=None, y=None):
        return self

    @staticmethod
    def inverse_transform(X):
        return [", ".join(doc) for doc in X]

    def transform(self, X):
        return [self.token_representation(sentence) for sentence in X]

    def token_representation(self, sentence):
        return list(self.tokenize(sentence))

    def tokenize(self, sentence):
        """break sentence into pos-tagged tokens; normalize and split on hyphens"""

        # extremely short sentences shall be ignored by next steps
        if len(sentence) < MIN_LEN:
            yield "_empty_sentence_"
        else:
            for token in self.tokenizer.tokenize(sentence):
                # Apply preprocessing to the token
                token_nrm = self.normalize_token(token)
                subtokens = [
                    self.normalize_token(t)
                    for t in self.splitters.split(token_nrm)
                ]

                for subtoken in subtokens:
                    # If punctuation, ignore token and continue
                    if all(char in self.punct for char in token):
                        continue
                    yield subtoken

    def normalize_token(self, token):
        # Apply preprocessing to the token
        token = token.lower().strip().strip('*').strip('.')

        if self.rules:
            token = map_regex_concepts(token)

        return token
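A minimal sketch of using the transformer, with stand-ins for the module-level MIN_LEN constant and map_regex_concepts() helper that the class references (both stand-ins are assumptions, not part of the original snippet); the imports are assumed to be in place before the class definition:

import re
import string
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import TreebankWordTokenizer

MIN_LEN = 5                        # stand-in for the real module-level constant

def map_regex_concepts(token):     # stand-in for the real rule-based normalizer
    return token

pre = TokenizePreprocessor(rules=True)
print(pre.transform(["Dose was 5 mg/kg twice-daily."]))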
Example #16
 def __init__(self):
     lexicon = Lexicon(TreebankWordTokenizer())
     word2vec_name = 'word2vec/amazon.bin'
     vocab_size = 100000
     word2vec = Word2VecManager(path.join(Constants.DATASETS,
                                          word2vec_name),
                                vocab_size=vocab_size)
     source = EmbeddingVecSource(lexicon, word2vec)
     self.loader = ImdbDataLoader(source,
                                  root=path.join(Constants.DATASETS,
                                                 'amazon'))
     self.train = 'out'
     self.test = 'test'
Example #17
 def __init__(self):
     word2vec_name = 'word2vec/Imdb_min2.bin'
     vocab_size = 100000
     lexicon = Lexicon(TreebankWordTokenizer())
     word2vec = Word2VecManager(path.join(Constants.DATASETS,
                                          word2vec_name),
                                vocab_size=vocab_size)
     source = EmbeddingVecSource(lexicon, word2vec)
     self.loader = ImdbDataLoader(source,
                                  root=path.join(Constants.DATASETS,
                                                 'aclImdb'))
     self.train = 'Mixed'
     self.test = 'All/Test'
Example #18
 def predict(self, text):
     spans = list(TreebankWordTokenizer().span_tokenize(text))
     list_of_tokens = [
         text[i:j] if text[i:j] in self.dataset.vec.word2idx else UNK
         for (i, j) in spans
     ]
     tokenized_spans = [None, *spans, None]
     list_of_tokens = [SOS, *list_of_tokens, EOS]
     sequences = list(
         map(lambda s: int(self.dataset.vec.word2idx[s]), list_of_tokens))
     predictions, attentions, conicity_values = self.model.evaluate(
         [sequences])
     predictions = np.array(predictions)
     return predictions[0], attentions[0], tokenized_spans, list_of_tokens
Example #19
class TreebankSpanTokenizer(TreebankWordTokenizer):
    def __init__(self):
        self._word_tokenizer = TreebankWordTokenizer()

    def span_tokenize(self, text):
        ix = 0
        for word_token in self.tokenize(text):
            ix = text.find(word_token, ix)
            end = ix + len(word_token)
            yield ix, end, word_token
            ix = end

    def tokenize(self, text):
        return self._word_tokenizer.tokenize(text)
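Unlike NLTK's own span_tokenize(), which yields (start, end) pairs, this wrapper yields (start, end, token) triples; a quick sketch:

spans = TreebankSpanTokenizer().span_tokenize("Hello, world!")
print(list(spans))
# [(0, 5, 'Hello'), (5, 6, ','), (7, 12, 'world'), (12, 13, '!')]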
Example #20
def _tok_and_norm(corpus_path: str) -> List:
    """
    Normalizes quotation marks in each line of the corpus, tokenizes it, and appends the resulting token list to 'tok_sents'.
    """

    tok_sents = []
    with open(corpus_path, 'r') as source:
        for line in tqdm(source):
            # normalize quotation marks/apostrophes for tokenization
            norm_toks = (line.replace("”", '"').replace("“", '"').replace(
                "’", "'").replace("‘", "'").replace("amp;", ""))
            # tokenizes each sentence and appends that tok_sent to the list tok_sents
            tok_sent = TreebankWordTokenizer().tokenize(norm_toks)
            tok_sents.append(tok_sent)

    return tok_sents
Example #21
def preprocess(df):
    p_stemmer = PorterStemmer()

    tbt = TreebankWordTokenizer()
    custom_en_stop = ['want', 'go', 'hey', 'also', 'ok']

    df = df.apply(lambda row: row.lower())
    df = df.apply(lambda row: re.sub('{.+}', '', row))
    df = df.apply(lambda row: re.sub("[0-9]{1,2} ?(am|pm)", "timeofday", row))
    df = df.apply(lambda row: re.sub("[0-9]{1,2} ?(hours?|hrs?|mins?|minutes?)", "durationtext", row))
    df = df.apply(lambda row: re.sub("[0-9]{10}\D", "phoneorpnr", row))
    df = df.apply(lambda row: word_tokenize(row))
    df = df.apply(lambda row: [WordNetLemmatizer().lemmatize(i) for i in row])
    df = df.apply(lambda row: [i for i in row if i not in string.punctuation])
    df = df.apply(lambda row: [i for i in row if i not in custom_en_stop])
    df = df.apply(lambda x: ' '.join(x))

    return df
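A rough usage sketch with a tiny pandas Series, assuming the NLTK/pandas imports the snippet relies on and that the punkt and wordnet corpora are available (the messages are made up):

import re
import string
import pandas as pd
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer

msgs = pd.Series(["Hey, call me at 10 am!", "Need a cab for 2 hrs please."])
print(preprocess(msgs).tolist())
# times and durations are replaced by the 'timeofday' / 'durationtext' placeholders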
Example #22
def get_word_ids(query):
    con = clickhouse_driver.connect("clickhouse://127.0.0.1")
    ids = []
    ps = PorterStemmer()
    for word_start, word_end in TreebankWordTokenizer().span_tokenize(query):
        word = query[word_start:word_end]
        stem = ps.stem(word)
        cur = con.cursor()
        cur.execute("SELECT id FROM words WHERE word = %(word)s",
                    {"word": stem})
        row = cur.fetchone()
        if row is None:
            print(
                f"Warning: Word {word} in form of {stem} not found in a database, skipping"
            )
        else:
            id = row[0]
            ids.append(id)
    return ids
Example #23
class NormalizationTokenization:
    def __init__(self):
        self.letters_mappings = { u"á" : "a", 
                                  u"é" : "e", 
                                  u"í" : "i",
                                  u"ó" : "o",
                                  u"ú" : "u",
                                  u"ñ" : "n",
                                  u"ü" : "u" }
        self.tokenizer = TreebankWordTokenizer()
        
    def letter_without_accent(self, letter):
        'This method returns the version of a letter without accent'
        if letter in self.letters_mappings:
            return self.letters_mappings[letter]
        else:
            return letter
    
    def normalize(self, text):
        '''This method returns normalized version of the text.
        It removes all disallowed characters and makes the text lower case'''
        text = text.lower()
        mapIterator = map(lambda letter: self.letter_without_accent(letter), text)
        text = "".join(mapIterator)
        
        regex = r'[^a-zA-Z0-9\s\_\-\n]'
        text = re.sub(regex, '', text)
        return text
    
    def tokenize(self, text):
        'This method returns the text divided into tokens'
        return self.tokenizer.tokenize(text)
    
    def process_text(self, text):
        '''This method is the main method of this class. 
        It processes the text and returns the result'''
        normalized_text = self.normalize(text)
        token_list = self.tokenize(normalized_text)
        return token_list
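For example, assuming the `re` and TreebankWordTokenizer imports the class relies on:

nt = NormalizationTokenization()
print(nt.process_text(u"¡El niño comió más jalapeños!"))
# ['el', 'nino', 'comio', 'mas', 'jalapenos']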
Example #24
    u"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    u"|"
    # host name
    u"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
    # domain name
    u"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
    # TLD identifier
    u"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
    u")"
    # port number
    u"(?::\d{2,5})?"
    # resource path
    u"(?:/\S*)?",
    re.UNICODE)

tokenizer = TreebankWordTokenizer()

stopword_set = set(stopwords.words("english"))
punctuation_set = set(string.punctuation)

stemmer = EnglishStemmer()


def process_txt(txt, stem=True):
    words = []
    txt_stripped = url_regex.sub("", txt)
    try:
        for sentence in sent_tokenize(txt_stripped):
            for w in tokenizer.tokenize(sentence):
                w_lower = w.lower()
                if w_lower not in stopword_set and w_lower not in punctuation_set:
Example #25
class HindiLanguage(StopWordsFromFileMixIn):
    """Hindi language support module."""

    __slots__ = [
        # Stop words map
        '__stop_words_map',

        # Hunspell instance
        '__hindi_hunspell',

        # Word tokenizer
        '__treebank_tokenizer',
    ]

    def __init__(self):
        """Constructor."""
        super().__init__()

        self.__treebank_tokenizer = TreebankWordTokenizer()

        hunspell_dict_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'hindi-hunspell',
            'dict-hi_IN',
        )
        if not os.path.isdir(hunspell_dict_dir):
            raise McLanguageException(
                "Hunspell dictionary directory does not exist at path: %s." % hunspell_dict_dir
            )

        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.dic')):
            raise McLanguageException("Hunspell dictionary file does not exist at path: %s" % hunspell_dict_dir)
        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.aff')):
            raise McLanguageException("Hunspell affix file does not exist at path: %s" % hunspell_dict_dir)

        try:
            self.__hindi_hunspell = Hunspell(lang='hi_IN', hunspell_data_dir=hunspell_dict_dir)
        except Exception as ex:
            raise McLanguageException(
                "Unable to initialize Hunspell with data directory '%s': %s" % (hunspell_dict_dir, str(ex),)
            )

        # Quick self-test to make sure that Hunspell is installed and dictionary is available
        hunspell_exc_message = """
            Hunspell self-test failed; make sure that Hunspell is installed and dictionaries are accessible, e.g.
            you might need to fetch Git submodules by running:

                git submodule update --init --recursive
        """
        try:
            test_stems = self.stem_words(['गुरुओं'])
        except Exception as _:
            raise McLanguageException(hunspell_exc_message)
        else:
            if len(test_stems) == 0 or test_stems[0] != 'गुरु':
                raise McLanguageException(hunspell_exc_message)

    @staticmethod
    def language_code() -> str:
        return "hi"

    @staticmethod
    def sample_sentence() -> str:
        return (
            "ऋषियों को सताने वाले दुष्ट राक्षसों के राजा रावण का सर्वनाश करने वाले "
            "विष्णुवतार भगवान श्रीराम, अयोध्या के महाराज दशरथ के बड़े सपुत्र थे।"
        )

    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)
        if words is None:
            raise McLanguageException("Words to stem is None.")

        stems = []

        for word in words:
            if word is None or len(word) == 0:
                log.debug("Word is empty or None.")
                stem = word
            else:
                term_stems = self.__hindi_hunspell.stem(word)
                if len(term_stems) > 0:
                    stem = term_stems[0]

                    if stem is None or len(stem) == 0:
                        log.debug("Stem for word '%s' is empty or None." % word)
                        stem = word

                else:
                    log.debug("Stem for word '%s' was not found." % word)
                    stem = word

            stems.append(stem)

        if len(words) != len(stems):
            log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

        return stems

    def split_text_to_sentences(self, text: str) -> List[str]:
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        # Replace Hindi's "।" with line break to make tokenizer split on both "।" and period
        text = text.replace("।", "।\n\n")

        # No non-breaking prefixes file for Hindi, so using the English one
        en = EnglishLanguage()
        return en.split_text_to_sentences(text)

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        # Normalize apostrophe so that "it’s" and "it's" get treated identically
        sentence = sentence.replace("’", "'")

        # Replace Hindi's "।" with line break to make tokenizer split on both "।" and period
        sentence = sentence.replace("।", ".")

        # TweetTokenizer / sentence_splitter don't work with Hindi for whatever reason, and word_tokenize() would
        # require NLTK data to be installed which is time consuming on Travis
        tokens = self.__treebank_tokenizer.tokenize(sentence)

        def is_word(token_: str) -> bool:
            """Returns True if token looks like a word."""
            if re.match(pattern=r'\w', string=token_, flags=re.UNICODE):
                return True
            else:
                return False

        # The tokenizer leaves punctuation in place, so keep only word-like tokens
        tokens = [token for token in tokens if is_word(token)]

        return tokens
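The danda handling in split_sentence_to_words() can be illustrated on its own, without the Hunspell setup; a hedged sketch using a plain TreebankWordTokenizer and a made-up Hindi sentence:

import re
from nltk.tokenize import TreebankWordTokenizer

sentence = "वह घर गया।".replace("।", ".")
tokens = TreebankWordTokenizer().tokenize(sentence)
print([t for t in tokens if re.match(r'\w', t, flags=re.UNICODE)])
# ['वह', 'घर', 'गया'] -- the danda/period is dropped, the Devanagari words are kept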
Example #26
    def create_data_list(self, filename_list):
        #return two lists, create id2word and id2ner mapping dicts
        
        data_list = []
        ner_list = []
        self.id2word = {}
        self.id2ner = {}
        ner_id = 1
        word_id = 1

        puncts= "()-,.?!:;*/--"
        
        for filename in filename_list:
            #split train and validation dataset
            if 'Test' in str(filename):
                split = 'test'
            else:
                split = random.choices(["train", "val"], weights=(80, 20), k=1)[0]  # randomly split the rest 80/20 into train and validation
            
            #parse xml data
            tree = ET.parse(filename)
            root = tree.getroot()
            for elem in root:
                sent_id = elem.get("id")
                sentence = elem.get("text")
                text_tokens = TreebankWordTokenizer().tokenize(sentence)
                text_tokenized = [word.strip(puncts).lower() if word[-1] in puncts else word for word in text_tokens]
                text_tokenized = list(filter(None, text_tokenized)) 
                span_text = list(TreebankWordTokenizer().span_tokenize(sentence)) 
                
                # create data list
                char_ids = []
                for st in span_text:
                    char_ids.append((st[0], (st[1]-1)))
                for i, token in enumerate(text_tokenized):
                    if token.lower() not in self.id2word.values():
                        self.id2word[word_id] = token.lower()
                        word_id += 1
                    for id, word in self.id2word.items():
                        if word == token.lower():
                            token_id = id
                    word_info_list = (sent_id, token_id, int(char_ids[i][0]), int(char_ids[i][1]), split)
                    data_list.append(word_info_list)
                  
                # create NER data list
                for sub_elem in elem:
                    if sub_elem.tag == "entity":
                        ner = sub_elem.get("type")
                        if ner not in self.id2ner.values():
                            self.id2ner[ner_id] = ner
                            ner_id += 1
                        for id, ner_tmp in self.id2ner.items():
                            if ner_tmp == ner:
                                label = id
                        #get char_start_id and char_end_id
                        if ";" not in sub_elem.get("charOffset"):
                            char_start, char_end = sub_elem.get("charOffset").split("-")
                            char_start, char_end = int(char_start), int(char_end)
                            ner_list.append([sent_id, label, char_start, char_end])
                        #if more than one mention of an entity, split into several lines
                        else:
                            occurences = sub_elem.get("charOffset").split(";")
                            for occurence in occurences:
                                char_start, char_end = occurence.split("-")
                                char_start, char_end = int(char_start), int(char_end)
                                ner_list.append([sent_id, label, char_start, char_end])

        self.vocab = list(self.id2word.values())
        return data_list, ner_list
Example #27
import json
import pickle as pk
import numpy as np
import random
import os
from tqdm import tqdm
import re
from glob import glob
from nltk import TreebankWordTokenizer
from string import punctuation

from collections import Counter

_tokenrize = TreebankWordTokenizer().tokenize


_START_VOCAB = ["<unk>", "<pad>", "<stop>"]


def format_data(js):
    data_list = js['data']
    formated = []
    for article in data_list:
        for passage in article['paragraphs']:
            context = passage['context'].strip()  # unicode string
            for qa in passage['qas']:
                q = qa['question']  # unicode string
                # a = qa['answers']   # list of dicts
                answer = [(_['text'].strip(), int(_['answer_start']))
                          for _ in qa['answers']]
                answer = set(answer)
Example #28
    def recognize(self, text) -> Set[Annotation]:

        annotations = []

        # We normalize the text (Remove all punctuation and replace with whitespace)

        normalized_input_text = self.punctuation_remove.sub(" ", text).replace(
            "-", " ").lower()

        # We split the text into token spans (begin and end position from the start of the text)
        spans = TreebankWordTokenizer().span_tokenize(normalized_input_text)
        token_spans = [i for i in spans]

        # we iterate over tokens one by one until we reach the end of the text
        current_token_span_index = 0
        while current_token_span_index < len(token_spans):
            # we get the current token span
            currentSpan = token_spans[current_token_span_index]

            # we extract the string of the token from the text
            token = normalized_input_text[currentSpan[0]:currentSpan[1]]

            # if the word is a stoplist term or a termination term we skip it
            if token not in self.stop_words and token not in self.termination_terms:
                # We get the concept ids matching the phone of the current token
                token_phone = doublemetaphone(token)[0]
                concepts = self.concepts_from_phone(token_phone)

                # this is the start position of the first token of a matching sequence
                concept_start = currentSpan[0]
                # For now we have matched a single term, so currently the end position will be that of the current
                # token
                concept_end = currentSpan[1]
                match_cursor = 1
                stop_count = 0
                while current_token_span_index + match_cursor < len(
                        token_spans):

                    # We get the next token and position span
                    next_span = token_spans[current_token_span_index +
                                            match_cursor]
                    next_token = normalized_input_text[
                        next_span[0]:next_span[1]]

                    # if the token is in the termination list the matching process ends here
                    if next_token in self.termination_terms:
                        break
                    # If the token is in the Stop list we skip it and increment the count of the skipped words
                    # We will need to subtract this from the total number of tokens for the concept
                    elif next_token in self.stop_words:
                        stop_count += 1
                    # Otherwise we try to find a match for the token phone in the dictionary index
                    else:
                        # we doublemetaphone the token's text
                        next_token_phone = doublemetaphone(next_token)[0]

                        # We try to find matching concepts and compute the intersection with previously identified
                        # concepts

                        next_concepts = self.concepts_from_phone(
                            next_token_phone) & concepts

                        # if we find none we stop the matching here
                        if len(next_concepts) == 0:

                            break

                        else:
                            # if we find a match, then we update the current end position to that of the currently
                            # matching token and update the intersected matched concept buffer
                            concepts = next_concepts
                            concept_end = next_span[1]

                    # if we arrive here the current token has matched, we keep count of the current match length
                    match_cursor += 1

                # Once we get out of the loop we reconstruct the matches from the concepts remaining in the set
                # after successive intersections, if concepts is empty there was no match and so
                # Tokens.conceptsToAnnotationTokens will return an empty list otherwise we get a list of
                # AnnotationToken objects instances that we add to the list of identified concepts

                for concept in concepts:
                    key_parts = concept.split(":::")
                    concept_id = key_parts[0]
                    annotation = Annotation(
                        concept_id,
                        concept_start,
                        concept_end,
                        text[concept_start:concept_end],
                        match_cursor - stop_count,
                        label_key=concept,
                        concept=self.concept_index[concept])
                    annotations.append(annotation)

            current_token_span_index += 1
        # Here we filter the annotations to keep only those where the concept length matches the length of the
        # identified annotation
        return set([
            annotation for annotation in annotations
            if annotation.matched_length == self.concept_length_index[
                annotation.label_key]
        ])
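The matching above is keyed on Double Metaphone codes; a standalone sketch of that primitive, using the third-party `metaphone` package the snippet appears to rely on (the package name is an assumption):

from metaphone import doublemetaphone      # pip package "Metaphone" (assumed)
from nltk.tokenize import TreebankWordTokenizer

text = "diabetes mellitus"
for start, end in TreebankWordTokenizer().span_tokenize(text):
    token = text[start:end]
    print(token, doublemetaphone(token)[0])   # primary phonetic code used as the index key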
Example #29
 def __init__(self):
     self._word_tokenizer = TreebankWordTokenizer()
Example #30
    def tokenize(self, text):

        return TreebankWordTokenizer().tokenize(text)
Example #31
import nltk.data
from nltk import word_tokenize, TreebankWordTokenizer

# usage: app inputFile
# output is written in the same dir with a name like inputFile + preprocessed


# wordTokenizer = RegexpTokenizer("[\w']+")


finalOutputFile = open(sys.argv[1] + "_preprocessed_sentences_splitted", 'w')
reviewsJSONFile = open(sys.argv[1], "r")
linenumber = 0

word_tokenizer = TreebankWordTokenizer()
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

for line in reviewsJSONFile:
    if linenumber % 1000 == 0:
        print(linenumber)
    linenumber += 1
    objJSON = json.loads(line)
    # tokenize and clean the review text
    reviewSTR = objJSON['reviewText']
    excludeSet = string.punctuation + string.digits
    tokenList = []
    sentList = sent_detector.tokenize(reviewSTR.strip())
    for sent in sentList:
        # removes digits punctuations and transforms to lower case.
        sent = ''.join(' ' if ch in set(excludeSet) else ch.lower() for ch in sent)
Example #32
from nltk import word_tokenize, TreebankWordTokenizer

# usage: app inputFile category prefix
# output is written in the same dir with a name like inputFile + preprocessed

# wordTokenizer = RegexpTokenizer("[\w']+")


finalOutputFile = open(sys.argv[1] + "_preprocessed", 'w')
reviewsJSONFile = open(sys.argv[1], "r")
prefix = sys.argv[3]

linenumber = 0
dummy_name = 0

word_tokenizer = TreebankWordTokenizer()
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

for line in reviewsJSONFile:
    if linenumber % 1000 == 0:
        print(linenumber)
    linenumber += 1
    objJSON = json.loads(line)
    # tokenize and clean the review text
    reviewSTR = objJSON['reviewText']
    excludeSet = string.punctuation + string.digits
    tokenList = []
    sentList = sent_detector.tokenize(reviewSTR.strip())
    for sent in sentList:
        # removes digits punctuations and transforms to lower case.
        sent = ''.join(' ' if ch in set(excludeSet) else ch.lower() for ch in sent)