Example #1
    def __init__(self, sentence):
        en_parser = StanfordParser(
            path_to_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser.jar',
            path_to_models_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
            model_path=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        sg = StanfordTokenizer(
            path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
        )
        self.status = 0
        self.trans = googletrans.Translator()

        self.sentence = sentence.strip("\n").replace(" ", "")

        en_trans = self.trans.translate(sentence).text
        en_trans = sg.tokenize(en_trans)
        try:
            tree = list(en_parser.parse(en_trans))
            self.tree = tree[0]
            # print(self.tree)
            self.rel = []
        except Exception:
            self.status = 1
Example #2
def get_sentence_embeddings(sentences, ngram='bigrams', model='concat_wiki_twitter'):
        """ Returns a numpy matrix of embeddings for one of the published models. It
        handles tokenization and can be given raw sentences.
        Arguments:
            - ngram: 'unigrams' or 'bigrams'
            - model: 'wiki', 'twitter', or 'concat_wiki_twitter'
            - sentences: a list of raw sentences ['Once upon a time', 'This is another sentence.', ...]
        """
        wiki_embeddings = None
        twitter_embbedings = None
        tokenized_sentences_NLTK_tweets = None
        tokenized_sentences_SNLP = None
        if model == "wiki" or model == 'concat_wiki_twitter':
            tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
            s = ' <delimiter> '.join(sentences)  # just a trick to make things faster
            tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
            tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
            assert (len(tokenized_sentences_SNLP) == len(sentences))
            wiki_embeddings = get_embeddings_for_preprocessed_sentences(tokenized_sentences_SNLP, \
                                                                            MODEL_PG_3KBOOKS_BIGRAMS, FASTTEXT_EXEC_PATH)

        if model == "wiki":
            return wiki_embeddings
        elif model == "concat_wiki_twitter":
            return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)
        sys.exit(-1)
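Below is a minimal usage sketch for get_sentence_embeddings() above; it assumes the module-level constants it reads (SNLP_TAGGER_JAR, MODEL_PG_3KBOOKS_BIGRAMS, FASTTEXT_EXEC_PATH) and the helpers tokenize_sentences() / get_embeddings_for_preprocessed_sentences() are already configured as in the surrounding project.

# Hypothetical call -- works only once the constants and helper
# functions referenced inside get_sentence_embeddings() resolve.
sentences = ['Once upon a time', 'This is another sentence.']
embeddings = get_sentence_embeddings(sentences, ngram='bigrams', model='wiki')
print(embeddings.shape)  # one row of embeddings per input sentence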
Example #3
def tokenize(content):
    """Breaks up text-based content into tokens in the style of PTB corpus"""
    _path_to_jar = os.path.abspath(
        'summarize/stanford-postagger/stanford-postagger.jar')
    token_list = []
    st = StanfordTokenizer(path_to_jar=_path_to_jar)
    content = content.lower()
    token_list = st.tokenize(content)
    return token_list
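A hedged usage sketch for tokenize(); it assumes the stanford-postagger JAR exists at the relative path resolved inside the function and that a Java runtime is available.

# Requires summarize/stanford-postagger/stanford-postagger.jar and Java on the PATH.
tokens = tokenize("Good muffins cost $3.88 in New York.")
print(tokens)  # PTB-style tokens, lowercased, e.g. ['good', 'muffins', 'cost', '$', '3.88', ...]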
Example #4
def stanfordNERInit():
    # On Windows the classpath separator is ';', not ':'
    os.environ[
        'CLASSPATH'] = 'C:/users/home/stanford-ner/stanford-ner-2017-06-09/stanford-ner.jar;C:/users/home/stanford-ner/stanford-ner-2017-06-09/lib/*;C:/users/home/stanford-postagger-full-2017-06-09/stanford-postagger.jar'
    os.environ[
        'STANFORD_MODELS'] = 'C:/users/home/stanford-ner/stanford-ner-2017-06-09/classifiers/'
    sent_detection = nltk.data.load('tokenizers/punkt/english.pickle')
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    tokenizer = StanfordTokenizer()

    return sent_detection, st, tokenizer
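One way the three returned objects might be combined (a sketch only; the hard-coded Windows paths in the environment variables above are assumed to be valid on your machine).

# Sentence-split, tokenize, then NER-tag each sentence.
sent_detection, st, tokenizer = stanfordNERInit()
text = "Barack Obama visited Stanford University in California."
for sent in sent_detection.tokenize(text):
    print(st.tag(tokenizer.tokenize(sent)))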
Example #5
    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"', self.dm_single_close_quote,
            self.dm_double_close_quote, ")"
        ]  # acceptable ways to end a sentence

        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})
Example #6
def sentence_embeddings(wikiuni, snlpjar, fasttext, sentences, ngram='unigrams', model='concat_wiki_twitter'):
    """ 
    Generate embeddings from a list of sentences.

    Parameters:
    -----------
    wikiuni: string
        Path to the Wikipedia embeddings
    snlpjar: string
        Path to the JAR file of the Stanford tagger (used by StanfordTokenizer)
    fasttext: string
        Path to the executable of FastText 
    sentences: list
        List containing raw sentences
        e.g., ['Once upon a time', 'This is another sentence.', ...]
    ngram: string (unigrams|bigrams)
        ngram used in Wikipedia embeddings
    model: string (wiki|twitter|concat_wiki_twitter)
    """
    wiki_embeddings = None
    twitter_embbedings = None
    tokenized_sentences_NLTK_tweets = None
    tokenized_sentences_SNLP = None
    if model == "wiki" or model == 'concat_wiki_twitter':
        tknzr = StanfordTokenizer(snlpjar, encoding='utf-8')
        s = ' <delimiter> '.join(sentences) #just a trick to make things faster
        tkn_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tkn_sentences_SNLP = tkn_sentences_SNLP[0].split(' <delimiter> ')
        assert(len(tkn_sentences_SNLP) == len(sentences))
        if ngram == 'unigrams':
            wiki_embeddings = sent2embeddings(tkn_sentences_SNLP, \
                                     wikiuni, fasttext)
    # We are not using Twitter or Bigrams so far
    #     else:
    #         wiki_embeddings = sent2embeddings(tkn_sentences_SNLP, \
    #                                  MODEL_WIKI_BIGRAMS, FASTTEXT_EXEC_PATH)
    # if model == "twitter" or model == 'concat_wiki_twitter':
    #     tknzr = TweetTokenizer()
    #     tkn_sentences_NLTK_tweets = tokenize_sentences(tknzr, sentences)
    #     if ngram == 'unigrams':
    #         twitter_embbedings = sent2embeddings(tkn_sentences_NLTK_tweets, \
    #                                  MODEL_TWITTER_UNIGRAMS, FASTTEXT_EXEC_PATH)
    #     else:
    #         twitter_embbedings = sent2embeddings(tkn_sentences_NLTK_tweets, \
    #                                  MODEL_TWITTER_BIGRAMS, FASTTEXT_EXEC_PATH)
    #
    if model == "wiki":
        return wiki_embeddings
    #elif model == "twitter":
    #    return twitter_embbedings
    #elif model == "concat_wiki_twitter":
    #    return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)
    sys.exit(-1)
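A usage sketch for sentence_embeddings(); every path below is a placeholder for your own copies of the Wikipedia unigram embedding model, the Stanford tagger JAR, and the fastText executable, and the helpers tokenize_sentences() / sent2embeddings() are assumed to be defined as in the surrounding module.

# Placeholder paths -- replace with real locations.
wikiuni = '/path/to/wiki_unigrams.bin'
snlpjar = '/path/to/stanford-postagger.jar'
fasttext = '/path/to/fasttext'
sentences = ['Once upon a time', 'This is another sentence.']
embs = sentence_embeddings(wikiuni, snlpjar, fasttext, sentences,
                           ngram='unigrams', model='wiki')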
Example #7
def tokenize_q(qa, phase):
    qas = len(qa)
    MyTokenizer = StanfordTokenizer()
    for i, row in enumerate((qa)):
        row['question_toked'] = MyTokenizer.tokenize(
            row['question'].lower())[:14]
        if i % 50000 == 0:
            json.dump(qa,
                      open('vqa_' + phase + '_toked_' + str(i) + '.json', 'w'))
        if i == qas - 1:
            json.dump(qa, open('vqa_' + phase + '_toked.json', 'w'))
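A sketch of how tokenize_q() could be driven; the input file name is hypothetical, and each entry is expected to carry a 'question' field.

import json

# Hypothetical VQA-style input file: a list of {'question': ...} dicts.
with open('vqa_train_questions.json') as f:
    qa = json.load(f)
tokenize_q(qa, 'train')  # writes vqa_train_toked_*.json checkpoints plus vqa_train_toked.json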
Example #8
def tokenize_stopwords_stemmer(texts):

    # Use this block when tokenizing with the Stanford tokenizer; it is not needed for plain tokenization
    # tokenize
    Str_texts = texts[0]
    #tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")  # path_to_jar locates the jar; the r prefix keeps the string raw, so a sequence such as '\t' in the path is not treated as an escape
    java_path = 'C:/Program Files/Java/jdk1.8.0_121/bin/java.exe'
    os.environ['JAVAHOME'] = java_path
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a single string; tokenize it
    #print(texts_tokenized)

    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)  # re.compile() turns the regex string into a Pattern object, which is then used to match the text
    texts_filtered0 = [document for document in texts_tokenized if document not in pa1.findall(document)]

    p2 = r'.+[-_\/].+'  # changed from r'.+[-_\./].+' so periods between digits are kept, e.g. version numbers like 3.1.2
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            if document.find('_') > -1:  # split(): cut the string on the given separator and return the resulting list
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
            elif document.find('/') > -1:
                texts_filtered = texts_filtered + document.split('/')
        else:
            texts_filtered.append(document)

    texts_filtered = [document for document in texts_filtered if document != '' and document != "''" and document != "``"]  # drop empty strings and the '' / `` quote tokens

    # stopwords
    english_stopwords = stopwords.words('english')  # load the English stopword list
    texts_filtered_stopwords = [document for document in texts_filtered if document not in english_stopwords]  # filter out stopwords

    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '\n',
                            '<', '>', '/', '\"', '\'', '{', '}', '!', '~', '`',
                            '$', '^', '/*', '*/', '/**', '**/', '**', '-', '_', '+', '=', r'-?-', r'@?']  # punctuation tokens

    texts_filtered = [document for document in texts_filtered_stopwords if document not in english_punctuations]  # filter out punctuation
    #print(texts_filtered)
    temp = texts_filtered[:]  # remove tokens that contain 'comment'
    for i in temp:
        if 'comment' in i:
            texts_filtered.remove(i)
    #print(texts_filtered)
    #texts_filtered = [re.sub(r'^[1-9]\d*$'.format(punctuation), '', x) for x in texts_filtered]  # ^[1-9]\d*$ filters out bare integers

    porter = nltk.PorterStemmer()  # Porter stemming algorithm
    texts_Stemmered = [porter.stem(t) for t in texts_filtered]  # stem every remaining token
    return texts_Stemmered  # return the list of stems
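A minimal sketch of calling tokenize_stopwords_stemmer(); note that it only processes texts[0], and it relies on os, re, nltk, the NLTK stopwords corpus, and the stanford-parser.jar / Java paths configured inside the function.

# Only the first element of the list is tokenized.
stems = tokenize_stopwords_stemmer(["The parsers were parsing 3.1.2-style version strings."])
print(stems)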
Example #9
    def __init__(self):
        self.dm_single_close_quote = u'\u2019'  # unicode
        self.dm_double_close_quote = u'\u201d'
        self.END_TOKENS = [
            '.', '!', '?', '...', "'", "`", '"', self.dm_single_close_quote,
            self.dm_double_close_quote, ")"
        ]  # acceptable ways to end a sentence

        # We use these to separate the summary sentences in the .bin datafiles
        self.SENTENCE_START = '<s>'
        self.SENTENCE_END = '</s>'
        self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                           options={"tokenizeNLs": True})
Example #10
    def __init__(self):

        # set environment variable
        # TO DO: update to Docker path
        os.environ['CLASSPATH'] = resource_filename(__name__, 'tokenizers/')

        # load tokenizer and tagger
        # TO DO: again, update to Docker path
        self.STANFORD_TOKENIZER = StanfordTokenizer(
            resource_filename(__name__, 'tokenizers/stanford-ner-3.6.0.jar'))
        self.SMO_tagger = StanfordNERTagger(
            resource_filename(__name__,
                              'classifiers/ner-orgs_2016-03-28_all.ser.gz'))
Example #11
def spans(txt):
    english_tokenizer = StanfordTokenizer(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar',
        options={
            "americanize": True,
        },
        java_options='-mx1000m')
    tokens = english_tokenizer.tokenize(txt)
    offset = 0
    for token in tokens:
        offset = txt.find(token, offset)
        yield token, offset, offset + len(token)
        offset += len(token)
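A usage sketch for the spans() generator above; the hard-coded tokenizer JAR path inside it is assumed to exist on your machine.

# Each yielded triple is (token, start_offset, end_offset) into the raw text.
txt = "Good muffins cost $3.88 in New York."
for token, start, end in spans(txt):
    print(token, txt[start:end], (start, end))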
Example #12
    def __init__(self, sentence):

        en_parser = StanfordParser(path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar',
                                   path_to_models_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar',
                                   )
        sg = StanfordTokenizer(path_to_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar')
        self.trans = googletrans.Translator()

        self.sentence = sentence

        result1 = sg.tokenize(self.trans.translate(sentence).text)

        tree = list(en_parser.parse(result1))
        self.tree = tree[0]
        self.rel=[]
Example #13
    def __init__(
            self,
            singleword_spells,
            multiword_spells,
            tokenize_by="text",  #tokenize_by="sentence",
            punkt_tokenizer='tokenizers/punkt/english.pickle',
            path_stanford_jar="/home/david/Descargas/stanford-corenlp-3.8.0.jar"
    ):

        self.singleword_spells = singleword_spells
        self.multiword_spells = multiword_spells
        self.multiword_spells_joint = [
            "_".join(s.split()) for s in multiword_spells
        ]
        self.tokenize_by = tokenize_by
        self.toktok = StanfordTokenizer(path_to_jar=path_stanford_jar)
        self.sent_detector = nltk.data.load(punkt_tokenizer)
Example #14
def get_sentence_embeddings(sentences, train, d):
    """ Returns a numpy matrix of embeddings for one of the published models. It
    handles tokenization and can be given raw sentences.
    Arguments:
        - sentences: a list of raw sentences ['Once upon a time', 'This is another sentence.', ...]
        - train, d: forwarded to get_embeddings_for_preprocessed_sentences
    """
    tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
    s = ' <delimiter> '.join(sentences)  #just a trick to make things faster
    tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])[0]
    # tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
    assert (len(tokenized_sentences_SNLP) == len(sentences))
    wiki_embeddings = get_embeddings_for_preprocessed_sentences(
        tokenized_sentences_SNLP, MODEL_WIKI_UNIGRAMS, FASTTEXT_EXEC_PATH,
        train, d)

    return wiki_embeddings
Example #15
    def __init__(self, paths_json):
        set_environment_paths(paths_json)

        self.sentence_sequences = []
        self.valence_sequences = []
        self.sentence_trees = []
        self.valence_trees = []
        self.CompleteWordIndices = []

        self.model = ""
        self.models2run = []
        self.neg_scope_method = ""
        self.neg_res_method = ""
        self.sent_comp_method = ""

        valence_dict_path = paths_json["VALENCE_DICT"]
        with open(valence_dict_path) as json_file:
            self.VALENCE_DICT = json.loads(json_file.read())

        negtool_negscopes_path = paths_json["NEGTOOL_NEGSCOPE"]
        self.negtool_neg_scopes_file = open(negtool_negscopes_path, "r")
        self.negtool_neg_scopes_file_current_line = 0
        self.use_negtool = False

        meaning_spec_distribution_dict_path = paths_json[
            "MEANING_SPEC_DISTRIBUTION_DICT_PATH"]
        with open(meaning_spec_distribution_dict_path) as json_file:
            self.distribution_dict = json.loads(json_file.read())
        #window neg scope
        self.window_size = 4

        self.review_id = 0
        self.sentence_id = 0  #for negtool purposes

        #constants
        self.contractions = ["n't", "'m", "'ll", "'d", "'s", "'ve", "'re"]

        #parser and tokenizer initialization
        # self.PARSER = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        self.TOKENIZER = StanfordTokenizer()

        #using server
        self.CORENLP = StanfordCoreNLP('http://localhost:9000')
Example #16
 def __init__(self, data, path, all_names):
     self.data = data
     self.path = path
     self.english_postagger = StanfordPOSTagger(
         path + 'models/english-left3words-distsim.tagger',
         path + 'lib/stanford-postagger-3.4.1.jar',
         java_options='-Xmx2g')
     self.english_tokenizer = StanfordTokenizer(
         path + 'lib/stanford-postagger-3.4.1.jar', 'utf-8')
     self.all_names = all_names
     self.pos = self.extract_POS()
     self.nms = self.extract_names()
     self.wg1 = self.extract_wordgrams(1)
     self.wg2 = self.extract_wordgrams(2)
     self.cg1 = self.extract_chargrams(1)
     self.cg2 = self.extract_chargrams(2)
     self.cg3 = self.extract_chargrams(3)
     self.bl = self.extract_breaklines()
     self.ws = self.extract_websites()
Example #17
    def Tok_handler(self, sentence, parser):
        if parser == "spacy":
            try:
                import spacy, en_core_web_sm
            except ImportError:
                print("Can't import spacy")
            nlp = en_core_web_sm.load()
            doc = nlp(sentence)
            return [str(token) for token in doc]
        elif parser == "nltk":
            try:
                import nltk
                from nltk.tokenize.stanford import StanfordTokenizer

                os.environ["CLASSPATH"] = "./StanfordNLP/jars"
                os.environ["STANFORD_MODELS"] = "./StanfordNLP/models"
            except ImportError:
                print("Can't import spacy")
            tokenizer = StanfordTokenizer()
            return tokenizer.tokenize(sentence)
Example #18
    def __init__(self, sentence):

        en_parser = StanfordParser(
            path_to_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser.jar',
            path_to_models_jar=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
            model_path=
            '../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
        )
        sg = StanfordTokenizer(
            path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar'
        )
        self.trans = googletrans.Translator()

        self.sentence = sentence

        result1 = sg.tokenize(self.trans.translate(sentence).text)

        tree = list(en_parser.parse(result1))
        self.tree = tree[0]
        self.rel = []
Example #19
def tokenize(corenlp, review, span=False):
    r_dict = corenlp._request('ssplit', review)
    tokens2 = StanfordTokenizer().tokenize(review)
    print(r_dict)
    print(tokens2)
    tokens = [
        token['word'] for s in r_dict['sentences'] for token in s['tokens']
    ]

    sentences = []
    current_sentence = []
    for token in tokens:
        if (not bool(re.compile(r'[^\!\?]').search(token))
                or token == "."):  #only ! or ?
            current_sentence.append(token)
            sentences.append(current_sentence)
            current_sentence = []
        else:
            current_sentence.append(token)

    #return [" ".join(sentence[:-1])+sentence[-1] for sentence in sentences] #return sentences
    return sentences  #return tokenized sentences
Example #20
import emoji

from nltk.tokenize.stanford import StanfordTokenizer

s = "Good muffins :-X cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
tokens = StanfordTokenizer().tokenize(s)

print(tokens)

a = {'a', 'b', 'c'}
b = {'b', 'c', 'd'}
c = {'a', 'd', 'e'}

d = a | b | c

print(d)
Example #21
    reader_obj = csv.reader(csvfile)
    for row in reader_obj:
        # row[0] -> id
        # row[1] -> title
        # row[2] -> content
        # row[3] -> tags
        #soup = BeautifulSoup(row[2])
        #if soup.code != None:
        #    codecount += 1
        title_list += " " + row[1]
        check = int(time.time() - start)
        if check >= 10:
            print(count, time.time() - start, "seconds")
            start = time.time()
        if not count % 10000:
            word_list = StanfordTokenizer(path_to_jar="/Users/apple/Downloads//stanford-postagger-2015-01-29/stanford-postagger-2015-01-30/stanford-postagger.jar"\
                    ).tokenize(title_list)
            fout.write(' '.join(word_list).encode('utf-8') + "\n")
            title_list = ""
            print(count)
            #word_list = nltk.word_tokenize(row[1].encode('utf-8'))
        #if not codecount%10000:
        #    print codecount
        count += 1
    #print(soup.get_text())
    #pdb.set_trace()

word_list = StanfordTokenizer(path_to_jar="/Users/apple/Downloads//stanford-postagger-2015-01-29/stanford-postagger-2015-01-30/stanford-postagger.jar"\
        ).tokenize(title_list)
fout.write(' '.join(word_list).encode('utf-8') + "\n")
title_list = []
print(count)
print("Tokenizing all requests.")

tweet_tokenizer = TweetTokenizer(preserve_case=True,
                                 reduce_len=True,
                                 strip_handles=True)

tokenized_datasets_original_tweet = [[
    tweet_tokenizer.tokenize(request) for request in dataset
] for dataset in datasets]

print("Retokenizing with Stanford tokenizer. This may take a long time.")

path_pos = "/playpen/home/tongn/stanford-postagger-full-2017-06-09/"
jar_pos = "stanford-postagger.jar"

tokenizer = StanfordTokenizer(path_pos + jar_pos)

# tokenized_datasets_original = [
#     [tokenizer.tokenize(' '.join(request).strip())
#      for request in dataset]
#     for dataset in tokenized_datasets_original_tweet]
tokenized_datasets_original = tokenized_datasets_original_tweet
"""
Convert all tokens to lowercase
"""
tokenized_datasets = [[[token.lower() for token in request]
                       for request in dataset]
                      for dataset in tokenized_datasets_original]
"""
Build the whole vocabulary
Example #23
#sqlite3 connection
dbname = '/home/aahu/Dropbox/ryancompton.net/assets/praw_drugs/drugs.db'
conn = sqlalchemy.create_engine('sqlite+pysqlite:///' + dbname,
                                module=sqlite3.dbapi2)


def load_subreddit(tablename, conn):
    df = pd.read_sql(tablename, conn)
    return df


# <codecell>

from nltk.tokenize.stanford import StanfordTokenizer
stanfordTokenizer = StanfordTokenizer(
    path_to_jar=
    '/home/aahu/Downloads/stanford-corenlp-full-2015-01-30/stanford-corenlp-3.5.1.jar'
)


def my_tokenize(text):
    return nltk.word_tokenize(text)
    #return nltk.wordpunct_tokenize(text)
    #return stanfordTokenizer.tokenize(text)
    #return nltk.tokenize.TreebankWordTokenizer().tokenize(text)


def build_tfidf_transformer(docs=[],
                            tokenizer=my_tokenize,
                            max_doc_count=2000,
                            vocab_limit=10000):
    """
Example #24
# unique_normalized_tokens = set(tokens)
#
# wnl = nltk.WordNetLemmatizer()
# vocabulary = [wnl.lemmatize(t) for t in unique_normalized_tokens]
# print("Aantal vocabulary: ", len(vocabulary))

#
from nltk.parse import stanford
from nltk.parse.stanford import StanfordParser
from nltk.tokenize.stanford import StanfordTokenizer

parser = StanfordParser(
    model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
tokenizer = StanfordTokenizer(
    "/home/spijkervet/stanford/stanford-postagger-full/stanford-postagger-3.9.1.jar"
)

# Stanford parser uses punctuation!
# processed_sentences = [s.translate(s.maketrans('','', string.punctuation)).lower() for s in sentences]
processed_sentences = [s.lower() for s in sentences]

pickle_name = "hp_trees_parser.pickle"
if not os.path.isfile(pickle_name):
    hp_trees = []
    for s in tqdm(processed_sentences):
        # tree = list(parser.raw_parse(s))
        tree = parser.raw_parse(s)
        # for node in tree:
        #     print(node)
        hp_trees.append(tree)
Example #25
 def __init__(self):
     self.tk = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
Example #26
 def __init__(self, jar_path):
     self.tokenizer = StanfordTokenizer(jar_path)
Example #27
 def run(self, data):
     for corpus in data:
         corpus.tokenized_contents = StanfordTokenizer().tokenize(
             corpus.contents)
     return data
Example #28
gold = []
for each in lis:
    item = each.split("\t")
    gold.append(item[1]) 
    item_email =  ' '.join(e for e in item[2:])
    item_email = item_email.replace('</br>',' ').replace(':',' ')#.split()
    #print(item_email)
    emails.append(item_email)
    emails_len.append(len(item_email))

max_words_email = max(emails_len)
total_email = len(emails)
embedding_size = 700
all_mail_vec = []

tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
s = ' <delimiter> '.join(emails)
tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
emails = tokenized_sentences_SNLP[0].split(' <delimiter> ')


embs_email = model.embed_sentences(emails)  

#each_mail_vec = np.zeros((max_words_email, embedding_size))
#print(vector.shape)
f.close()

f = open('emails_dataset/emailExplanations_Dec23.sorted.txt','r')

concepts = {'REMINDER':[], 'HUMOR' : [], 'EVENT': [], 'EMPLOYEE': [], 'MEETING' : [], 'POLICY' : [], 'CONTACT' : []}
Example #29
def stanford_tokenize(s):
    return StanfordTokenizer().tokenize(s)
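A one-line usage sketch; it assumes the CLASSPATH environment variable already points at a Stanford JAR that provides the tokenizer, so StanfordTokenizer() can be built without arguments.

# CLASSPATH must already contain e.g. stanford-postagger.jar.
print(stanford_tokenize("Good muffins cost $3.88 in New York."))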
Example #30
        # get next where clause
        where = wheres[:andid]
        # parse current where clause
        sql['conds'].append(parse_where(where))
        # striped processed where clause out of wheres
        wheres = wheres[andid + 3:]

    # parse last where clause
    sql['conds'].append(parse_where(wheres))

    return sql


if __name__ == '__main__':

    stanford = StanfordTokenizer()

    with open('pairs_ronny{}.csv'.format(teststr), 'r+') as f:
        lines_pairs = f.readlines()[1:]
    with open('table_ronny.csv', 'r') as f:
        lines_table = f.readlines()

    # parse table
    entry_table = parse_table(lines_table, stanford)
    with open('dummy_tok.tables.jsonl', 'w') as f:
        json.dump(entry_table, f)

    # parse all pairs
    f = open('dummy_tok{}.jsonl'.format(teststr), 'w+')
    for line in lines_pairs:
        entry = dict(phase=2)