Example #1
    def __init__(self, sentence):
        en_parser = StanfordParser(
            path_to_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser.jar',
            path_to_models_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
            model_path=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        sg = StanfordTokenizer(
            path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
        )
        self.status = 0
        self.trans = googletrans.Translator()

        self.sentence = sentence.strip("\n").replace(" ", "")

        en_trans = self.trans.translate(sentence).text
        en_trans = sg.tokenize(en_trans)
        try:
            tree = list(en_parser.parse(en_trans))
            self.tree = tree[0]
            # print(self.tree)
            self.rel = []
        except Exception:
            self.status = 1
Example #2
def clean_content(page, verbose=True):
    content = page.content
    tag_name = ''
    ret_content = []
    for line in content.splitlines():
        match = re.match('=+ +(.+)? +=+', line)
        # Is this line a section tag? If not, add it to the output list
        if not match:
            if len(line) and tag_name not in clean_content.omit_sections:
                ret_content.append(line)
            continue
        # Update the current tag name
        tag_name = match.group(1)
        match = re.match('(.+)?Edit', tag_name)
        if match: tag_name = match.group(1)
    # Sentence-initial capital letter, sentence-final period
    st = StanfordTokenizer()
    ret_tokens = []
    for idx, line in enumerate(ret_content):
        if verbose:
            sys.stdout.write('\rParsing "%s" %d / %d'%(page.title, idx+1, len(ret_content)))
            sys.stdout.flush()
        tokens = st.tokenize(line)
        indices = [0]+[i+1 for i, e in enumerate(tokens) if e in ['.','!','?']]
        subtokens = [tokens[indices[i]:indices[i+1]] for i in range(len(indices)-1)]
        ret_tokens.extend(filter(lambda tokens: tokens[0][0].isupper(), subtokens))
    if verbose: sys.stdout.write('\n'); sys.stdout.flush()
    return '\n'.join([' '.join(line) for line in ret_tokens])
Example #3
def tokenize(content):
    """Breaks up text-based content into tokens in the style of PTB corpus"""
    _path_to_jar = os.path.abspath(
        'summarize/stanford-postagger/stanford-postagger.jar')
    token_list = []
    st = StanfordTokenizer(path_to_jar=_path_to_jar)
    content = content.lower()
    token_list = st.tokenize(content)
    return token_list
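A minimal usage sketch for the helper above; the input sentence is illustrative, and the stanford-postagger.jar path hard-coded in tokenize() must exist locally.

# Hypothetical call; the result is a list of lowercased PTB-style tokens
# (clitics such as "n't" become separate tokens).
ptb_tokens = tokenize("Mr. Smith doesn't like the U.S. dollar.")
print(ptb_tokens)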
Example #4
    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"', self.dm_single_close_quote,
            self.dm_double_close_quote, ")"
        ]  # acceptable ways to end a sentence

        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})
Example #5
def tokenize_q(qa, phase):
    qas = len(qa)
    MyTokenizer = StanfordTokenizer()
    for i, row in enumerate((qa)):
        row['question_toked'] = MyTokenizer.tokenize(
            row['question'].lower())[:14]
        if i % 50000 == 0:
            json.dump(qa,
                      open('vqa_' + phase + '_toked_' + str(i) + '.json', 'w'))
        if i == qas - 1:
            json.dump(qa, open('vqa_' + phase + '_toked.json', 'w'))
Example #6
def tokenize_stopwords_stemmer(texts):
    
    # Use this block when tokenizing with Stanford; it is not needed for ordinary tokenization
    # tokenize
    Str_texts=texts[0]
    #tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")  # path_to_jar locates the jar; the r prefix keeps the string raw so sequences such as '\t' in a path are not escaped
    java_path = 'C:/Program Files/Java/jdk1.8.0_121/bin/java.exe'
    os.environ['JAVAHOME'] = java_path
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a single string; tokenize it
    #print(texts_tokenized)

    p1=r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)  # re.compile() turns the regex string into a Pattern object used to match the text
    texts_filtered0 = [document for document in texts_tokenized if document not in pa1.findall(document)]
    
    p2 = r'.+[-_\/].+'  # changed from r'.+[-_\./].+' so that periods between digits are kept, e.g. version strings like 3.1.2
    pa2=re.compile(p2)
    texts_filtered=[]
    for document in  texts_filtered0:
        if document in pa2.findall(document):
            if document.find('_') > -1:  # split() cuts the string on the given separator and returns a list of parts
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-')>-1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.')>-1:
                texts_filtered = texts_filtered + document.split('.')
            elif document.find('/')>-1:
                texts_filtered = texts_filtered + document.split('/')
        else:
            texts_filtered.append(document)
    
    texts_filtered = [document for document in texts_filtered if document != '' and document != "''" and document != "``"]  # drop empty strings and quote artifacts
  
    # stopwords
    english_stopwords = stopwords.words('english')  # load the English stopword list
    texts_filtered_stopwords = [document for document in texts_filtered if document not in english_stopwords]  # filter out stopwords
    
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '\n',
                            '<', '>', '/', '\"', '\'', '{', '}', '!', '~', '`',
                            '$', '^', '/*', '*/', '/**', '**/', '**', '-', '_', '+', '=', r'-?-', r'@?']  # punctuation and operator tokens

    texts_filtered = [document for document in texts_filtered_stopwords if document not in english_punctuations]  # filter out punctuation
    #print texts_filtered
    temp = texts_filtered[:]  # copy so that elements containing 'comment' can be removed while iterating
    for i in temp:
        if 'comment' in i:
            texts_filtered.remove(i)
    #print(texts_filtered)
    #texts_filtered=[re.sub(r'^[1-9]\d*$'.format(punctuation), '', x) for x in texts_filtered]  # ^[1-9]\d*$ would filter out bare integers

    porter = nltk.PorterStemmer()  # Porter stemming algorithm
    texts_Stemmered = [porter.stem(t) for t in texts_filtered]  # stem every remaining token
    return texts_Stemmered  # return the list of stemmed tokens
Example #7
    def __init__(self):

        # set environment variable
        # TO DO: update to Docker path
        os.environ['CLASSPATH'] = resource_filename(__name__, 'tokenizers/')

        # load tokenizer and tagger
        # TO DO: again, update to Docker path
        self.STANFORD_TOKENIZER = StanfordTokenizer(
            resource_filename(__name__, 'tokenizers/stanford-ner-3.6.0.jar'))
        self.SMO_tagger = StanfordNERTagger(
            resource_filename(__name__,
                              'classifiers/ner-orgs_2016-03-28_all.ser.gz'))
Example #8
def spans(txt):
    english_tokenizer = StanfordTokenizer(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar',
        options={
            "americanize": True,
        },
        java_options='-mx1000m')
    tokens = english_tokenizer.tokenize(txt)
    offset = 0
    for token in tokens:
        offset = txt.find(token, offset)
        yield token, offset, offset + len(token)
        offset += len(token)
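A short, hypothetical way to consume the generator above. The jar path hard-coded in spans() must exist, and normalizing options such as "americanize": True can rewrite a token so that txt.find() no longer locates it, in which case the reported offsets drift.

text = "Good muffins cost $3.88 in New York."
for token, start, end in spans(text):
    # start/end are character offsets into the original string
    print(token, text[start:end])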
Example #9
    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"', self.dm_single_close_quote,
            self.dm_double_close_quote, ")"
        ]  # acceptable ways to end a sentence

        # We use these to separate the summary sentences in the .bin datafiles
        self.SENTENCE_START = '<s>'
        self.SENTENCE_END = '</s>'
        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})
Example #10
    def __init__(self, sentence):

        en_parser = StanfordParser(path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar',
                                   path_to_models_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar',
                                   )
        sg = StanfordTokenizer(path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar')
        self.trans = googletrans.Translator()

        self.sentence = sentence

        result1 = sg.tokenize(self.trans.translate(sentence).text)

        tree = list(en_parser.parse(result1))
        self.tree = tree[0]
        self.rel=[]
Example #11
def stanfordTokenizer ( rawText ):
  """
  Uses Stanford University's natural language processing lab
    tokenizer to split raw text.
  """

  jarPath = "/Users/Nathan/nltk_data/stanford-postagger.jar"
  stanfordOptions = {
    "americanize": True,
    "ptb3Escaping": False
  }

  stanfordTokenizer = StanfordTokenizer( jarPath, 'UTF-8', stanfordOptions )

  return stanfordTokenizer.tokenize( rawText )
Example #12
def get_sentence_embeddings(sentences, ngram='bigrams', model='concat_wiki_twitter'):
        """ Returns a numpy matrix of embeddings for one of the published models. It
        handles tokenization and can be given raw sentences.
        Arguments:
            - ngram: 'unigrams' or 'bigrams'
            - model: 'wiki', 'twitter', or 'concat_wiki_twitter'
            - sentences: a list of raw sentences ['Once upon a time', 'This is another sentence.', ...]
        """
        wiki_embeddings = None
        twitter_embbedings = None
        tokenized_sentences_NLTK_tweets = None
        tokenized_sentences_SNLP = None
        if model == "wiki" or model == 'concat_wiki_twitter':
            tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
            s = ' <delimiter> '.join(sentences)  # just a trick to make things faster
            tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
            tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
            assert (len(tokenized_sentences_SNLP) == len(sentences))
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                                                            MODEL_PG_3KBOOKS_BIGRAMS, FASTTEXT_EXEC_PATH)

        if model == "wiki":
            return wiki_embeddings
        elif model == "concat_wiki_twitter":
            return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)
        sys.exit(-1)
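The ' <delimiter> ' join above is a speed trick: NLTK's StanfordTokenizer launches a Java subprocess per tokenize() call, so batching all sentences into one string means a single JVM start instead of one per sentence. A minimal sketch of the round trip, assuming SNLP_TAGGER_JAR and tokenize_sentences() from the surrounding module; the PTB tokenizer normally keeps SGML-like tags such as <delimiter> intact, which is what makes the split-back possible.

tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
sentences = ['Once upon a time', 'This is another sentence.']
joined = ' <delimiter> '.join(sentences)            # one string -> one Java subprocess
tokenized = tokenize_sentences(tknzr, [joined])[0]  # space-joined tokens for the whole batch
per_sentence = tokenized.split(' <delimiter> ')     # back to one token string per sentence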
Example #13
class tokenizer:
    def __init__(self):
        self.stanford_tokenizer = \
        StanfordTokenizer('../stanford-parser-2010-08-20/stanford-parser.jar'\
                         ,options={"americanize": False})
        pass

    #tokenize with stanford_parser
    def stanford_tokenize(self, row):
        temp_list = self.stanford_tokenizer.tokenize(row)
        return temp_list

    #tokenize with nltk word_tokenizer
    def word_tokenize(self, row):
        temp_list = nltk.word_tokenize(row)
        list_length = len(temp_list)
        index_list = list()
        for i in xrange(list_length):
            if temp_list[i].startswith('\''):
                if len(temp_list[i]) > 3:
                    temp_list[i] = temp_list[i][1:]
                    index_list.append(i)
        #end for
        count = 0
        for index in index_list:
            temp_list.insert(index + count, '\'')
            count += 1
        #end for
        return temp_list

    def no_block(self, string):
        string = re.sub(r' ', '', string)
        return len(string)
Example #14
class tokenizer:
    def __init__(self):
        self.stanford_tokenizer = \
        StanfordTokenizer('../stanford-parser-2010-08-20/stanford-parser.jar'\
                         ,options={"americanize": False})
        pass

    #tokenize with stanford_parser
    def stanford_tokenize(self, row):
        temp_list = self.stanford_tokenizer.tokenize(row)
        return temp_list

    #tokenize with nltk word_tokenizer
    def word_tokenize(self, row):
        temp_list = nltk.word_tokenize(row)
        list_length = len(temp_list)
        index_list = list()
        for i in xrange(list_length):
            if temp_list[i].startswith('\''):
                if len(temp_list[i]) > 3:
                    temp_list[i] = temp_list[i][1:]
                    index_list.append(i)
        #end for
        count = 0
        for index in index_list:
            temp_list.insert(index + count, '\'')
            count += 1
        #end for
        return temp_list

    def no_block(self, string):
        string = re.sub(r' ', '', string)
        return len(string)
Example #15
File: ffutils.py  Project: aghie/hpac
    def __init__(
            self,
            singleword_spells,
            multiword_spells,
            tokenize_by="text",  #tokenize_by="sentence",
            punkt_tokenizer='tokenizers/punkt/english.pickle',
            path_stanford_jar="/home/david/Descargas/stanford-corenlp-3.8.0.jar"
    ):

        self.singleword_spells = singleword_spells
        self.multiword_spells = multiword_spells
        self.multiword_spells_joint = [
            "_".join(s.split()) for s in multiword_spells
        ]
        self.tokenize_by = tokenize_by
        self.toktok = StanfordTokenizer(path_to_jar=path_stanford_jar)
        self.sent_detector = nltk.data.load(punkt_tokenizer)
Example #16
def stanfordNERInit():
    # Windows uses ';' as the classpath separator
    os.environ[
        'CLASSPATH'] = 'C:/users/home/stanford-ner/stanford-ner-2017-06-09/stanford-ner.jar;C:/users/home/stanford-ner/stanford-ner-2017-06-09/lib/*;C:/users/home/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
    os.environ[
        'STANFORD_MODELS'] = 'C:/users/home/stanford-ner/stanford-ner-2017-06-09/classifiers/'
    sent_detection = nltk.data.load('tokenizers/punkt/english.pickle')
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    tokenizer = StanfordTokenizer()

    return sent_detection, st, tokenizer
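A hypothetical end-to-end use of the three objects returned above; every jar, model, and classifier path inside stanfordNERInit() must exist on the machine.

sent_detection, st, tokenizer = stanfordNERInit()
text = "Barack Obama spoke at Stanford University in California."
for sentence in sent_detection.tokenize(text):      # Punkt sentence splitting
    tokens = tokenizer.tokenize(sentence)           # Stanford word tokenization
    print(st.tag(tokens))                           # (token, tag) pairs: PERSON / ORGANIZATION / LOCATION / O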
Example #17
    def __init__(self, data, path, all_names):
        self.data = data
        self.path = path
        self.english_postagger = StanfordPOSTagger(
            path + 'models/english-left3words-distsim.tagger',
            path + 'lib/stanford-postagger-3.4.1.jar',
            java_options='-Xmx2g')
        self.english_tokenizer = StanfordTokenizer(
            path + 'lib/stanford-postagger-3.4.1.jar', 'utf-8')
        self.all_names = all_names
        self.pos = self.extract_POS()
        self.nms = self.extract_names()
        self.wg1 = self.extract_wordgrams(1)
        self.wg2 = self.extract_wordgrams(2)
        self.cg1 = self.extract_chargrams(1)
        self.cg2 = self.extract_chargrams(2)
        self.cg3 = self.extract_chargrams(3)
        self.bl = self.extract_breaklines()
        self.ws = self.extract_websites()
Example #18
def sentence_embeddings(wikiuni, snlpjar, fasttext, sentences, ngram='unigrams', model='concat_wiki_twitter'):
    """ 
    Generate embeddings from a list of sentences.

    Parameters:
    -----------
    wikiuni: string
        Path to the Wikipedia embeddings
    snlpjar: string
        Path to the JAR file of the Stanford tokenizer
    fasttext: string
        Path to the FastText executable
    sentences: list
        List containing raw sentences,
        e.g., ['Once upon a time', 'This is another sentence.', ...]
    ngram: string (unigrams|bigrams)
        ngram variant used in the Wikipedia embeddings
    model: string (wiki|twitter|concat_wiki_twitter)
    """
    wiki_embeddings = None
    twitter_embbedings = None
    tokenized_sentences_NLTK_tweets = None
    tokenized_sentences_SNLP = None
    if model == "wiki" or model == 'concat_wiki_twitter':
        tknzr = StanfordTokenizer(snlpjar, encoding='utf-8')
        s = ' <delimiter> '.join(sentences) #just a trick to make things faster
        tkn_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tkn_sentences_SNLP = tkn_sentences_SNLP[0].split(' <delimiter> ')
        assert(len(tkn_sentences_SNLP) == len(sentences))
        if ngram == 'unigrams':
            wiki_embeddings = sent2embeddings(tkn_sentences_SNLP, \
                                     wikiuni, fasttext)
    # We are not using Twitter or Bigrams so far
    #     else:
    #         wiki_embeddings = sent2embeddings(tkn_sentences_SNLP, \
    #                                  MODEL_WIKI_BIGRAMS, FASTTEXT_EXEC_PATH)
    # if model == "twitter" or model == 'concat_wiki_twitter':
    #     tknzr = TweetTokenizer()
    #     tkn_sentences_NLTK_tweets = tokenize_sentences(tknzr, sentences)
    #     if ngram == 'unigrams':
    #         twitter_embbedings = sent2embeddings(tkn_sentences_NLTK_tweets, \
    #                                  MODEL_TWITTER_UNIGRAMS, FASTTEXT_EXEC_PATH)
    #     else:
    #         twitter_embbedings = sent2embeddings(tkn_sentences_NLTK_tweets, \
    #                                  MODEL_TWITTER_BIGRAMS, FASTTEXT_EXEC_PATH)
    #
    if model == "wiki":
        return wiki_embeddings
    #elif model == "twitter":
    #    return twitter_embbedings
    #elif model == "concat_wiki_twitter":
    #    return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)
    sys.exit(-1)
Example #19
    def Tok_handler(self, sentence, parser):
        if parser == "spacy":
            try:
                import spacy, en_core_web_sm
            except ImportError:
                print("Can't import spacy")
            nlp = en_core_web_sm.load()
            doc = nlp(sentence)
            return [str(token) for token in doc]
        elif parser == "nltk":
            try:
                import nltk
                from nltk.tokenize.stanford import StanfordTokenizer

                os.environ["CLASSPATH"] = "./StanfordNLP/jars"
                os.environ["STANFORD_MODELS"] = "./StanfordNLP/models"
            except ImportError:
                print("Can't import spacy")
            tokenizer = StanfordTokenizer()
            return tokenizer.tokenize(sentence)
Example #20
class Tokenizer(object):
    """
    Tokenize sentence
    """
    def __init__(self, jar_path):
        self.tokenizer = StanfordTokenizer(jar_path)

    def tokenize(self, sentence):
        return self.tokenizer.tokenize(sentence)

    def __call__(self, sentence):
        return self.tokenize(sentence)
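A minimal usage sketch; 'stanford-postagger.jar' is a placeholder for whichever Stanford jar is available locally.

tok = Tokenizer('stanford-postagger.jar')
print(tok('Good muffins cost $3.88 in New York.'))  # __call__ delegates to tokenize()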
Example #21
    def __init__(self, sentence):

        en_parser = StanfordParser(
            path_to_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser.jar',
            path_to_models_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
            model_path=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        sg = StanfordTokenizer(
            path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
        )
        self.trans = googletrans.Translator()

        self.sentence = sentence

        result1 = sg.tokenize(self.trans.translate(sentence).text)

        tree = list(en_parser.parse(result1))
        self.tree = tree[0]
        self.rel = []
Example #22
class Preprocessor:
    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"', self.dm_single_close_quote,
            self.dm_double_close_quote, ")"
        ]  # acceptable ways to end a sentence

        # We use these to separate the summary sentences in the .bin datafiles
        self.SENTENCE_START = '<s>'
        self.SENTENCE_END = '</s>'
        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})

    def tokenize(self, article):
        return self.tokenizer.tokenize(article)

    def fix_missing_period(self, line):
        """Adds a period to a line that is missing a period"""

        if line == "": return line
        if line[-1] in self.END_TOKENS: return line
        # print line[-1]
        return line + " ."

    def adjust_article(self, article):
        # takes the article as a list of lines and returns one cleaned string

        # Lowercase everything
        lines = [line.lower() for line in article]

        # Put periods on the ends of lines that are missing them (this is a problem in the dataset because many image captions don't end in periods; consequently they end up in the body of the article as run-on sentences)
        lines = [self.fix_missing_period(line) for line in lines]

        # Separate out article and abstract sentences
        article_lines = []

        for idx, line in enumerate(lines):
            if line == "":
                continue  # empty line
            else:
                article_lines.append(line)

        # Make article into a single string
        article = ' '.join(article_lines)

        # # Make abstract into a single string, putting <s> and </s> tags around the sentences
        # abstract = ' '.join(["%s %s %s" % (self.SENTENCE_START, sent, self.SENTENCE_END) for sent in highlights])

        return article
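A hypothetical run of the class above; the article is passed as a list of lines, which is what adjust_article() expects, and stanford-postagger.jar must be reachable from the working directory.

pre = Preprocessor()
lines = ["An image caption with no final period", "A normal sentence."]
article = pre.adjust_article(lines)   # lowercases, appends " ." where needed, joins into one string
tokens = pre.tokenize(article)        # runs the Stanford tokenizer with tokenizeNLs=True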
Example #23
def get_sentence_embeddings(sentences, train, d):
    """ Returns a numpy matrix of embeddings for one of the published models. It
    handles tokenization and can be given raw sentences.
    Arguments:
        - ngram: 'unigrams' or 'bigrams'
        - model: 'wiki', 'twitter', or 'concat_wiki_twitter'
        - sentences: a list of raw sentences ['Once upon a time', 'This is another sentence.', ...]
    """
    tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
    s = ' <delimiter> '.join(sentences)  #just a trick to make things faster
    tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])[0]
    # tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
    assert (len(tokenized_sentences_SNLP) == len(sentences))
    wiki_embeddings = get_embeddings_for_preprocessed_sentences(
        tokenized_sentences_SNLP, MODEL_WIKI_UNIGRAMS, FASTTEXT_EXEC_PATH,
        train, d)

    return wiki_embeddings
Example #24
class Preprocessor:
    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"', self.dm_single_close_quote,
            self.dm_double_close_quote, ")"
        ]  # acceptable ways to end a sentence

        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})

    def tokenize(self, text):
        return self.tokenizer.tokenize(text)

    def fix_missing_period(self, line):
        """Adds a period to a line that is missing a period"""

        if line == "":
            return line
        if line[-1] in self.END_TOKENS:
            return line
        return line + " ."

    def preprocess_text(self, text):
        """Preprocesses and prepares input text for summarization"""

        # Lowercase everything
        lines = [line.lower() for line in text]
        lines = [self.fix_missing_period(line) for line in lines]

        # Separate out text
        text_lines = []

        for idx, line in enumerate(lines):
            if line == "":
                continue  # empty line
            else:
                text_lines.append(line)

        # Make text into a single string
        text = ' '.join(text_lines)
        return text
Example #25
	def __init__(self,mode=None):
		self.config = GetConfig()
		if mode:
			self.mode = mode
		else:
			if self.config.has_option(self.MY_ID,'mode'):
				self.mode = self.config.get(self.MY_ID,'mode')
			else:
				self.mode = 'NLTK'
		if self.mode == 'STANFORD':
			from nltk.tokenize.stanford import StanfordTokenizer as Tokenizer
			self.tokenizer = Tokenizer()
		elif self.mode == 'NLTK':
			pass
		elif self.mode == 'MINE':
			self.spacePunct = re.compile(ur'[`~!@#\$%\^&\*\(\)\[\]{}_\+\-=\|\\:;\"\'<>,\?/]')
			self.removePunct = re.compile(ur'\.')
		else:
			raise Exception('Error: tokenizer, Unknown mode %s!' %(self.mode))
Example #26
    def __init__(self, paths_json):
        set_environment_paths(paths_json)

        self.sentence_sequences = []
        self.valence_sequences = []
        self.sentence_trees = []
        self.valence_trees = []
        self.CompleteWordIndices = []

        self.model = ""
        self.models2run = []
        self.neg_scope_method = ""
        self.neg_res_method = ""
        self.sent_comp_method = ""

        valence_dict_path = paths_json["VALENCE_DICT"]
        with open(valence_dict_path) as json_file:
            self.VALENCE_DICT = json.loads(json_file.read())

        negtool_negscopes_path = paths_json["NEGTOOL_NEGSCOPE"]
        self.negtool_neg_scopes_file = open(negtool_negscopes_path, "r")
        self.negtool_neg_scopes_file_current_line = 0
        self.use_negtool = False

        meaning_spec_distribution_dict_path = paths_json[
            "MEANING_SPEC_DISTRIBUTION_DICT_PATH"]
        with open(meaning_spec_distribution_dict_path) as json_file:
            self.distribution_dict = json.loads(json_file.read())
        #window neg scope
        self.window_size = 4

        self.review_id = 0
        self.sentence_id = 0  #for negtool purposes

        #constants
        self.contractions = ["n't", "'m", "'ll", "'d", "'s", "'ve", "'re"]

        #parser and tokenizer initialization
        # self.PARSER = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        self.TOKENIZER = StanfordTokenizer()

        #using server
        self.CORENLP = StanfordCoreNLP('http://localhost:9000')
Example #27
class tokenizer(object):
	MY_ID = 'TOKENIZER'
	def __init__(self,mode=None):
		self.config = GetConfig()
		if mode:
			self.mode = mode
		else:
			if self.config.has_option(self.MY_ID,'mode'):
				self.mode = self.config.get(self.MY_ID,'mode')
			else:
				self.mode = 'NLTK'
		if self.mode == 'STANFORD':
			from nltk.tokenize.stanford import StanfordTokenizer as Tokenizer
			self.tokenizer = Tokenizer()
		elif self.mode == 'NLTK':
			pass
		elif self.mode == 'MINE':
			self.spacePunct = re.compile(ur'[`~!@#\$%\^&\*\(\)\[\]{}_\+\-=\|\\:;\"\'<>,\?/]')
			self.removePunct = re.compile(ur'\.')
		else:
			raise Exception('Error: tokenizer, Unknown mode %s!' %(self.mode))

	def tokenize(self, sent):
		if sent.endswith('-') or sent.endswith('~'):
			sent += ' '
		sent = sent.replace('~ ', ' ~ ')
		sent = sent.replace('- ', ' - ')
		if self.mode == 'STANFORD':
			tokens = self.tokenizer.tokenize(sent.strip())
		elif self.mode == 'NLTK':
			tokens = nltk.word_tokenize(sent.strip())
		elif self.mode == 'MINE':
			new_sent = sent.strip()
			new_sent = self.spacePunct.sub(' ', new_sent)
			new_sent = self.removePunct.sub('', new_sent)
			tokens = new_sent.split()
		p_sent = ' '.join(tokens)
		p_sent = p_sent.replace('% ', '%')
		p_sent = p_sent.replace('``', '\"')
		p_sent = p_sent.replace('\'\'', '\"')
		p_tokens = p_sent.split(' ')
		return p_tokens
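An illustrative call of the mode switch above, assuming the surrounding module (GetConfig, the Python 2 ur'' literals) is importable; 'NLTK' mode avoids the Stanford jar entirely.

t = tokenizer(mode='NLTK')
print(t.tokenize("It costs 5% more - really?"))
# the post-processing above rejoins '% ' and rewrites ``/'' quote pairs to plain double quotes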
Example #28
def tokenize(corenlp, review, span=False):
    r_dict = corenlp._request('ssplit', review)
    tokens2 = StanfordTokenizer().tokenize(review)
    print(r_dict)
    print(tokens2)
    tokens = [
        token['word'] for s in r_dict['sentences'] for token in s['tokens']
    ]

    sentences = []
    current_sentence = []
    for token in tokens:
        if (not bool(re.compile(r'[^\!\?]').search(token))
                or token == "."):  #only ! or ?
            current_sentence.append(token)
            sentences.append(current_sentence)
            current_sentence = []
        else:
            current_sentence.append(token)

    #return [" ".join(sentence[:-1])+sentence[-1] for sentence in sentences] #return sentences
    return sentences  #return tokenized sentences
Example #29
def stanford_tokenize(s):
	return StanfordTokenizer().tokenize(s)
gold = []
for each in lis:
    item = each.split("\t")
    gold.append(item[1]) 
    item_email =  ' '.join(e for e in item[2:])
    item_email = item_email.replace('</br>',' ').replace(':',' ')#.split()
    #print(item_email)
    emails.append(item_email)
    emails_len.append(len(item_email))

max_words_email = max(emails_len)
total_email = len(emails)
embedding_size = 700
all_mail_vec = []

tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
s = ' <delimiter> '.join(emails)
tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
emails = tokenized_sentences_SNLP[0].split(' <delimiter> ')


embs_email = model.embed_sentences(emails)  

#each_mail_vec = np.zeros((max_words_email, embedding_size))
#print(vector.shape)
f.close()

f = open('emails_dataset/emailExplanations_Dec23.sorted.txt','r')

concepts = {'REMINDER':[], 'HUMOR' : [], 'EVENT': [], 'EMPLOYEE': [], 'MEETING' : [], 'POLICY' : [], 'CONTACT' : []}
import emoji

from nltk.tokenize.stanford import StanfordTokenizer

s = "Good muffins :-X cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
tokens = StanfordTokenizer().tokenize(s)

print(tokens)

a = {'a', 'b', 'c'}
b = {'b', 'c', 'd'}
c = {'a', 'd', 'e'}

d = a | b | c

print(d)
Example #32
    reader_obj = csv.reader(csvfile)
    for row in reader_obj:
        # row[0] -> id
        # row[1] -> title
        # row[2] -> content
        # row[3] -> tags
        #soup = BeautifulSoup(row[2])
        #if soup.code != None:
        #    codecount += 1
        title_list += " " + row[1]
        check = int(time.time() - start)
        if check >= 10:
            print count, time.time() - start, "seconds"
            start = time.time()
        if not count % 10000:
            word_list = StanfordTokenizer(path_to_jar="/Users/apple/Downloads//stanford-postagger-2015-01-29/stanford-postagger-2015-01-30/stanford-postagger.jar"\
                    ).tokenize(title_list)
            fout.write(' '.join(word_list).encode('utf-8') + "\n")
            title_list = ""
            print(count)
            #word_list = nltk.word_tokenize(row[1].encode('utf-8'))
        #if not codecount%10000:
        #    print codecount
        count += 1
    #print(soup.get_text())
    #pdb.set_trace()

word_list = StanfordTokenizer(path_to_jar="/Users/apple/Downloads//stanford-postagger-2015-01-29/stanford-postagger-2015-01-30/stanford-postagger.jar"\
        ).tokenize(title_list)
fout.write(' '.join(word_list).encode('utf-8') + "\n")
title_list = []
print(count)
print("Tokenizing all requests.")

tweet_tokenizer = TweetTokenizer(preserve_case=True,
                                 reduce_len=True,
                                 strip_handles=True)

tokenized_datasets_original_tweet = [[
    tweet_tokenizer.tokenize(request) for request in dataset
] for dataset in datasets]

print("Retokenizing with Stanford tokenizer. This may take a long time.")

path_pos = "/playpen/home/tongn/stanford-postagger-full-2017-06-09/"
jar_pos = "stanford-postagger.jar"

tokenizer = StanfordTokenizer(path_pos + jar_pos)
tokenizer = StanfordTokenizer(tagger_path)

# tokenized_datasets_original = [
#     [tokenizer.tokenize(' '.join(request).strip())
#      for request in dataset]
#     for dataset in tokenized_datasets_original_tweet]
tokenized_datasets_original = tokenized_datasets_original_tweet
"""
Convert all tokens to lowercase
"""
tokenized_datasets = [[[token.lower() for token in request]
                       for request in dataset]
                      for dataset in tokenized_datasets_original]
"""
Build the whole vocabulary
Example #34
def test_stanford_tokenizer():
    files = os.listdir("/Users/ruben/Desktop/txt/")
    standfor = StanfordTokenizer()
    total = sum(len(standfor.tokenize(readfile(DOCS_TXT_ROOT + f))) for f in files)

    print "\nStanfordTokenizer total " + str(total)
Example #35
class Values(object):
    def __init__(self, data, path, all_names):
        self.data = data
        self.path = path
        self.english_postagger = StanfordPOSTagger(
            path + 'models/english-left3words-distsim.tagger',
            path + 'lib/stanford-postagger-3.4.1.jar',
            java_options='-Xmx2g')
        self.english_tokenizer = StanfordTokenizer(
            path + 'lib/stanford-postagger-3.4.1.jar', 'utf-8')
        self.all_names = all_names
        self.pos = self.extract_POS()
        self.nms = self.extract_names()
        self.wg1 = self.extract_wordgrams(1)
        self.wg2 = self.extract_wordgrams(2)
        self.cg1 = self.extract_chargrams(1)
        self.cg2 = self.extract_chargrams(2)
        self.cg3 = self.extract_chargrams(3)
        self.bl = self.extract_breaklines()
        self.ws = self.extract_websites()

    def getVals(self):
        return self.bl, self.wg1, self.wg2, self.ws, self.nms, self.pos, self.cg1, self.cg2, self.cg3

    def extract_POS(self):
        return self.english_postagger.tag(
            self.english_tokenizer.tokenize(self.data))

    def extract_websites(self):
        websites = []
        result = re.findall('href=\"(.*?)\"', self.data)
        for r in result:
            if (r == 'mailto:') or (r == 'http:///'): continue
            else: websites.append(r)
        return websites

    def extract_breaklines(self):
        breaklines = []
        idx_old = 0
        idx_new = self.data.find('<br>')
        breaklines.append(idx_new - idx_old)
        idx_old = idx_new
        while idx_old < len(self.data):
            idx_new = self.data.find('<br>', idx_old + 4)
            if (idx_new == -1): break
            breaklines.append(idx_new - idx_old)
            idx_old = idx_new
        return breaklines

    def extract_chargrams(self, gram_size):
        return [
            ''.join(self.data[i:i + gram_size])
            for i in range(len(self.data) - gram_size + 1)
        ]

    def extract_wordgrams(self, gram_size):
        r = re.compile(r'[\s{}\t\n\r\+\>\<\=\¢\â\$]+'.format(
            re.escape(punctuation)))
        word_list = r.split(self.data)
        #word_list = re.split('\W+', self.data)
        #word_list = re.split(r'[\p{P} \\t\\n\\r\\+\\>\\<\\=\\¢\\â\\$]+', self.data)
        word_list = filter(None, word_list)
        return [
            ''.join(word_list[i:i + gram_size])
            for i in range(len(word_list) - gram_size + 1)
        ]

    def extract_names(self):
        r = re.compile(r'[\s{}\t\n\r\+\>\<\=\¢\â\$]+'.format(
            re.escape(punctuation)))
        word_list = r.split(self.data)
        #word_list = re.split('\W+', self.data)
        #word_list = re.split('[\p{P} \\t\\n\\r\\+\\>\\<\\=\\¢\\â\\$]+', self.data)
        word_list = filter(None, word_list)
        word_list = [x.lower() for x in word_list]
        return list(set(word_list) & set(self.all_names))
Example #36
#sqlite3 connection
dbname = '/home/aahu/Dropbox/ryancompton.net/assets/praw_drugs/drugs.db'
conn = sqlalchemy.create_engine('sqlite+pysqlite:///' + dbname,
                                module=sqlite3.dbapi2)


def load_subreddit(tablename, conn):
    df = pd.read_sql(tablename, conn)
    return df


# <codecell>

from nltk.tokenize.stanford import StanfordTokenizer
stanfordTokenizer = StanfordTokenizer(
    path_to_jar=
    '/home/aahu/Downloads/stanford-corenlp-full-2015-01-30/stanford-corenlp-3.5.1.jar'
)


def my_tokenize(text):
    return nltk.word_tokenize(text)
    #return nltk.wordpunct_tokenize(text)
    #return stanfordTokenizer.tokenize(text)
    #return nltk.tokenize.TreebankWordTokenizer().tokenize(text)


def build_tfidf_transformer(docs=[],
                            tokenizer=my_tokenize,
                            max_doc_count=2000,
                            vocab_limit=10000):
    """
Example #37
    def __init__(self):
        self.stanford_tokenizer = \
        StanfordTokenizer('../stanford-parser-2010-08-20/stanford-parser.jar'\
                         ,options={"americanize": False})
        pass