Example #1
 def __init__(self,sentence, graph):
     self.Construct_Pattern_House()
     self.sentence = sentence
     self.rdfgraph = graph
    # self.sentence = sentence
     self.st = StanfordPOSTagger('chinese-distsim.tagger')
     self.nodecount = dict()
Example #2
    def __add_basic_pos_tag(df):
        pos_path_jar = "./stanford-postagger-full-2017-06-09/stanford-postagger.jar"
        pos_path_model = "./stanford-postagger-full-2017-06-09/models/english-left3words-distsim.tagger"
        pos_tagger = StanfordPOSTagger(pos_path_model, pos_path_jar)

        pos = [pos_tagger.tag(s) for s in [df.word]]

        pos = [i[1] for i in pos[0]]

        pos = pd.DataFrame(pos)

        df['pos'] = pos

        return df
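
A hedged usage sketch (not part of the original snippet): it assumes a DataFrame with one token per row in a `word` column; the double underscore means the name is mangled when the helper sits inside a class, so the call below is purely illustrative.

sample = pd.DataFrame({"word": ["The", "cat", "sat"]})
sample = __add_basic_pos_tag(sample)  # tags the whole column as one sentence and adds a 'pos' column
print(sample[["word", "pos"]])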
Example #3
 def __init__(self, filename):
     self.filename = filename
     self.tokenizer = TreebankWordTokenizer()
     self.sent_tokenizer = load(
         'tokenizers/punkt/{0}.pickle'.format('english'))
     self.st = StanfordPOSTagger(
         '../stanfordPOStagger/english-bidirectional-distsim.tagger',
         '../stanfordPOStagger/stanford-postagger.jar',
         java_options='-mx2048m')
     #self.w2v_model = KeyedVectors.load_word2vec_format(
     #    "C:/Users/PC1/Desktop/python/деплом/deplom/constructions/GoogleNews-vectors-negative300.bin.gz",
     #    binary=True)
     self.w2v_model = None
     self.text = self.get_text()
     self.anns = []
     self.idx_list = IdxList()
     self.punct = punctuation + '‘’— \t\n'
Example #4
    def workflow_resources(self):
        corpus_encoding = self.task_config["CORPUS_ENCODING"]
        stanford_postagger_path = self.task_config["STANFORD_POSTAGGER_PATH"]
        stanford_models_path = self.task_config["STANFORD_MODELS_PATH"]
        stanford_pos_model_path = self.task_config["STANFORD_POS_MODEL_PATH"]

        tokenizer = StanfordTokenizer(stanford_models_path,
                                      encoding=corpus_encoding)
        pos_tagger = StanfordPOSTagger(stanford_pos_model_path,
                                       path_to_jar=stanford_postagger_path,
                                       encoding=corpus_encoding)

        workflow_resources = {"tokenizer": tokenizer, "pos_tagger": pos_tagger}

        return workflow_resources
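
A brief sketch (not from the original task class) of how the returned resources might be consumed downstream; the calling context and variable names are assumptions.

resources = task.workflow_resources()  # 'task' is a hypothetical instance of the class above
tokens = resources["tokenizer"].tokenize("A short example sentence.")
tagged = resources["pos_tagger"].tag(tokens)  # list of (token, tag) pairs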
Example #5
class POSTagger(BaseEstimator, TransformerMixin):
    def __init__(self, models_path=None):
        models_path = models_path or os.environ["MODELS_PATH"]
        jar_file = Path(models_path, "stanford-postagger.jar")
        tagger_file = Path(models_path, "spanish.tagger")

        self.tagger = StanfordPOSTagger(str(tagger_file), str(jar_file))

    def tag(self, token_list):
        tags = self.tagger.tag(token_list)
        _, tags = zip(*tags)
        return list(tags)

    def transform(self, x, y=None):
        return [self.tag(sequence) for sequence in x]
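
A minimal usage sketch, not taken from the original source: it assumes MODELS_PATH (or the models_path argument) points at a directory containing stanford-postagger.jar and spanish.tagger, and that a Java runtime is available.

tagger = POSTagger(models_path="/opt/stanford-models")  # hypothetical path
sentences = [["El", "gato", "duerme"], ["Hola", "mundo"]]
print(tagger.transform(sentences))  # one list of tags per tokenized sentence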
Example #6

os.environ[
    "STANFORD_MODELS"] = "/Users/umeraltaf/Desktop/QA_Project/StanfordNER"

from nltk.tag.stanford import StanfordNERTagger

stanford_NER_tagger = StanfordNERTagger(
    '/Users/umeraltaf/Desktop/QA_Project/StanfordNER/english.all.3class.distsim.crf.ser.gz',
    '/Users/umeraltaf/Desktop/QA_Project/StanfordNER/stanford-ner.jar')

from nltk import StanfordPOSTagger
os.environ[
    "STANFORD_MODELS"] = "/Users/umeraltaf/Desktop/QA_Project/StanfordNER"
stanford_POS_tagger = StanfordPOSTagger(
    '/Users/umeraltaf/Desktop/QA_Project/StanfordNER/english-bidirectional-distsim.tagger',
    '/Users/umeraltaf/Desktop/QA_Project/StanfordNER/stanford-postagger.jar')

with open('QA_train.json') as data_file:
    data = json.load(data_file)[:100]

stopwords = set(nltk.corpus.stopwords.words('english'))  # wrap in a set() for fast membership tests
stopwords.remove('the')
stopwords.remove('of')

stemmer = nltk.stem.PorterStemmer()

PunctuationExclude = set(string.punctuation)
PunctuationExclude.remove(',')
PunctuationExclude.remove('-')
Example #7
import pandas as pd
import numpy as np
import nltk
import re
from os.path import expanduser
from nltk import StanfordPOSTagger
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import brown
import spacy

home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'

st = StanfordPOSTagger(_path_to_model, _path_to_jar)

shitThing = ['.', ',', '-', '(', ')', ':']

test = pd.read_csv(
    '/Applications/Study/UWM/628/module2/textUsing/chineseAllReview.csv')
test.head(5)

tagList = [
    'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
    'VBZ'
]


def wordTrans(self):
    testBag = nltk.pos_tag(word_tokenize(self))
Example #8
def contentToList(page_content):
    list = sent_tokenize(page_content)
    # list = page_content.split(' ')
    print(list)
    cleanList = []
    list_with_startelement_numbers = []  # contains the start item of every speech text
    list_with_startEnd_numbers = []  # contains the start and end items of every speech text

    for i in range(len(list)):
        list_element = list[i]
        list_element = list_element.replace("\n", "")
        list_element = list_element.replace("-", "")
        cleanList.append(list_element)  # list without "-" and "\n"
        #print("item at index", i, ":", list_element)       # all list elements

        start_Element_Rede = 0
        '''analyze the structure of list_element'''
        ''' a speech begins after President Lammert hands over the floor '''
        matchers = ['Das Wort', 'das Wort']
        if any(m in list_element for m in matchers):
            print("item at index", i, ":",
                  list_element)  # Listenelemente, die matchers enthalten
            start_Element_Rede = i + 1
            list_with_startelement_numbers.append(start_Element_Rede)
            print("Start_Index_Redetext: ", start_Element_Rede)
            '''- POS -> part of speech: verbs, nouns, ... in the list element containing matchers'''
            words = word_tokenize(list_element)
            '''extracting Named Entities - Person, Organization,...'''
            jar = 'jars/stanford-postagger.jar'
            model = 'jars/german-hgc.tagger'
            pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')
            tagged = pos_tagger.tag(words)  # POS-tag the tokenized list element
            print(tagged)

            namedEnt = ne_chunk(tagged)
            print(namedEnt)

            #namedEnt.draw()

            def extract_entity_names(namedEnt):
                entityPers_names = []
                if hasattr(namedEnt, 'label') and namedEnt.label:
                    if namedEnt.label(
                    ) == 'PERSON':  #or namedEnt.label() == 'ORGANIZATION':
                        entityPers_names.append(' '.join(
                            [child[0] for child in namedEnt]))
                    else:
                        for child in namedEnt:
                            entityPers_names.extend(
                                extract_entity_names(child))
                return entityPers_names

            entityPerson_names = []
            entityPerson_names.extend(extract_entity_names(namedEnt))
            # Print all entity names
            print("Person: " + str(entityPerson_names))
            ''' Excel sheet with all politicians '''
            workbook = xlrd.open_workbook('mdb.xls')
            worksheet = workbook.sheet_by_name('Tabelle1')
            # read the name column (0) and the party column (1)
            value_of_first_col_Names = []
            value_of_second_col_Party = []
            first_col_Names = worksheet.col_values(0)
            second_col_Party = worksheet.col_values(1)
            print(first_col_Names)
            print(second_col_Party)

            matchers = first_col_Names
            politican_name = ""
            party_name = ""
            for i in range(len(entityPerson_names)):
                list_element = entityPerson_names[i]
                for m in range(len(matchers)):
                    matcher_element = matchers[m]
                    if matcher_element in list_element:
                        print("listen_eintrag", i, ": ", list_element)
                        print("excel_eintrag_name", m, ": ", matcher_element)
                        print("excel_eintrag_partei", m, ": ",
                              second_col_Party[m])
                        politican_name = matcher_element
                        party_name = second_col_Party[m]
                        ''' entry into DB: name + party '''
            ''' connection to the Abgeordnetenwatch API - JSON data extract '''
            # import urllib.request, json
            # politican_name = politican_name.lower()
            # print(politican_name)
            # politican_name = politican_name.replace(' ','-')
            # print(politican_name)
            # with urllib.request.urlopen("https://www.abgeordnetenwatch.de/api/profile/"+politican_name+"/profile.json") as url:
            #     data = json.loads(url.read().decode())
            #     print(data)
            #     print(data['profile']['personal']['first_name']+ " " +data['profile']['personal']['last_name'])
            #     print(data['profile']['party'])
            ''' entry into DB: name + party '''

    print("Liste mit Startnummern: ", list_with_startelement_numbers)
    # decrement every second start number (= the end) by 1 to mark the end of a speech
    # [start:stop:step]
    # print(list_with_startelement_numbers[1::2])
    for value in range(1, len(list_with_startelement_numbers), 2):
        list_with_startelement_numbers[
            value] = list_with_startelement_numbers[value] - 1
        #print(list_with_startelement_numbers)
    list_with_startEnd_numbers = list_with_startelement_numbers  # now contains the start and end item numbers of every speech text
    print("Liste mit Start + Endnummern: ", list_with_startEnd_numbers)

    for item in range(len(cleanList)):
        element = cleanList[item]
        #print("item at index", item, ":", element)

    alle_Reden = []
    x = 0
    y = 1
    start = 1
    print(len(list_with_startEnd_numbers))
    end = len(list_with_startEnd_numbers) - 1
    active = True
    while active:
        print("x: ", x)
        print("y: ", y)
        print("start: ", start)
        if start > end:
            active = False
            print("false")
        else:
            alle_Reden.append(cleanList[
                list_with_startEnd_numbers[x]:list_with_startEnd_numbers[y]]
                              )  # [everything between start:end]
            #print("weiter")
            #print("start: ", start)
        x += 2
        y += 2
        start += 2

    # print all speeches
    for rede in alle_Reden:
        print(rede)
        print("\n")
Example #9
# Or you can give "DEV" here, provided that dataset is available in the same directory
runOn = "Test"

# This switch enables a relaxed evaluation metric which awards credit for partial matches and for matching any of several possible answers
# the function is defined later on; see the report for details
# if False, a score of 1 is awarded only for an exact match with the correct answer (default in this project)
relaxedEvaluationMetric = False

# printing start time of the script
# This script should not take more than 4 or 5 minutes
print("Start Time:", ctime())

# initializing taggers and models from NLTK
stanford_NER_tagger = StanfordNERTagger(
    'english.all.3class.distsim.crf.ser.gz')
stanford_POS_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
stemmer = nltk.stem.PorterStemmer()

# os.environ["STANFORD_MODELS"] = "/Users/umeraltaf/Desktop/QA_Project/StanfordNER"
# stanford_NER_tagger = StanfordNERTagger('/Users/umeraltaf/Desktop/QA_Project/StanfordNER/english.all.3class.distsim.crf.ser.gz','/Users/umeraltaf/Desktop/QA_Project/StanfordNER/stanford-ner.jar')
# stanford_POS_tagger = StanfordPOSTagger('/Users/umeraltaf/Desktop/QA_Project/StanfordNER/english-bidirectional-distsim.tagger','/Users/umeraltaf/Desktop/QA_Project/StanfordNER/stanford-postagger.jar')
# stemmer = nltk.stem.PorterStemmer()

##Some path declarations for the precomputed models
# This is the cache file that will store the precomputed best sentences and tags
# so that we don't have to tag each time we run this script
if runOn == "DEV":
    fname = "bestSentencesTaggedEnhancedDev.bin"
else:
    fname = 'bestSentencesTaggedEnhancedTest.bin'
QuestionModelPATH = "QuestionClassificationModelStanford.pickle"
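
A hedged sketch of how such a pickle cache could be read and written; the tagging helper and the NER_tagged variable are assumptions here, and the original loading code is not shown in this excerpt.

import os, pickle

if os.path.exists(fname):
    with open(fname, "rb") as cache:
        NER_tagged = pickle.load(cache)  # reuse previously tagged best sentences
else:
    NER_tagged = tag_best_sentences(...)  # hypothetical helper that runs the slow Stanford tagging
    with open(fname, "wb") as cache:
        pickle.dump(NER_tagged, cache)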
Example #10
t0 = time.time()
datas = 'data/QA_dev.json'
print datas


from nltk import StanfordNERTagger, StanfordPOSTagger

dataset = json.loads(open(path.join(parent_path, datas)).readline())

ner_tagger = StanfordNERTagger(path.join(parent_path, 'data/english.all.3class.distsim.crf.ser.gz'),
                               path.join(parent_path, 'data/stanford-ner.jar'),
                               encoding='utf-8')

pos_tagger = StanfordPOSTagger(path.join(parent_path, 'data/wsj-0-18-left3words-distsim.tagger'),
                               path.join(parent_path, 'data/stanford-postagger.jar'),
                               encoding='utf-8')

prog_total = len(dataset)


def dmerge(ner, pos):
    if pos and pos[1] == 'CD':
        return ner[0], 'NUMBER'
    elif ner[1] == 'O':
        return pos
    else:
        return ner
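
# Illustrative behaviour of dmerge (descriptive comments, not in the original):
#   dmerge(('1969', 'O'), ('1969', 'CD'))           -> ('1969', 'NUMBER')     # cardinal numbers win
#   dmerge(('Paris', 'LOCATION'), ('Paris', 'NNP')) -> ('Paris', 'LOCATION')  # keep the NER tag
#   dmerge(('ran', 'O'), ('ran', 'VBD'))            -> ('ran', 'VBD')         # fall back to the POS tag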


def _merge_tag(ners, poss):
Example #11
import pandas as pd
import numpy as np
import nltk
import re
from os.path import expanduser
from nltk import StanfordPOSTagger
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import brown
import spacy

home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'

st = StanfordPOSTagger(_path_to_model, _path_to_jar)

qqExample = pd.read_csv('/Applications/Study/UWM/628/module2/qq.csv',
                        index_col=0)
qqExample.index = range(0, len(qqExample))

i = 3
qqExample.text[i]
nltk.pos_tag(word_tokenize(qqExample.text[i]))
st.tag(word_tokenize(qqExample.text[i]))
st.tag_sents([sent_tokenize(qqExample.text[i])])

qqAll = '. '.join(qqExample.text)
len(qqAll)
nltk.pos_tag(word_tokenize(qqAll))
st.tag(word_tokenize(qqAll))
Example #12
        return "LOCATION"
    elif "who" in question.lower():
        return "PERSON"
    elif "how many" in question.lower() or "number" in question.lower() or "count" in question.lower():
        return "NUMBER"
    elif "when" in question.lower() or "date" in question.lower():
        return "NUMBER"
    else:
        return "OTHER"




from nltk import StanfordPOSTagger
os.environ["STANFORD_MODELS"] = "/Users/umeraltaf/Desktop/QA_Project/StanfordNER"
stanford_tagger = StanfordPOSTagger('/Users/umeraltaf/Desktop/QA_Project/StanfordNER/english-bidirectional-distsim.tagger','/Users/umeraltaf/Desktop/QA_Project/StanfordNER/stanford-postagger.jar')




correct = 0
possCorrect = 0
wrongNumber = 0
totalans = 0
multiAnswer = 0
i = -1 #index of our NER_TAGGED list (i.e. questions)
for article in data:
    for question in article['qa']:

        i+=1
        taggedBestAnswerSent = NER_tagged[i]
Example #13
import sys

#print(len(sys.argv))
assert (len(sys.argv) == 4)
afile = sys.argv[1]
qfile = sys.argv[2]
numq = int(sys.argv[3])

#print(afile,qfile,numq)

nlpspacy = spacy.load("en_core_web_md")

stanforddir = 'stanford-postagger-2018-10-16/'
modelfile = stanforddir + 'models/english-bidirectional-distsim.tagger'
jarfile = stanforddir + 'stanford-postagger.jar'
postagger = StanfordPOSTagger(model_filename=modelfile, path_to_jar=jarfile)

#nltk.download("punkt")
#nltk.download("wordnet")
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')


def posToWordnet(pos):

    first = pos[0]
    if first == 'J':
        return 'a'
    elif first == 'V':
        return 'v'
Example #14
        postings = index[term]
        for docid, weight in postings:
            accumulator[docid] += weight
    return accumulator.most_common(k)


############ End of copied code

#printing start time of the script
print("Start Time:", ctime())

#initializing taggers and models from NLTK
#os.environ["STANFORD_MODELS"] = "/chechi/Documents/StanfordNER"
stanford_NER_tagger = StanfordNERTagger(
    'english.all.3class.distsim.crf.ser.gz')
stanford_POS_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')
stemmer = nltk.stem.PorterStemmer()

#This is the cache file that will store the precomputed best sentences and tags
#so that we don't have to tag each time we run this script
if (runOn == "DEV"):
    fname = 'bestSentencesTaggedDev.bin'
else:
    fname = 'bestSentencesTaggedTrain.bin'

#This variable will store all tagged most relevant sentences
NER_tagged = None

#Load the dataset; note that since the train set is large, only the first 50 articles are loaded

if (runOn == "DEV"):
 sys.stdout.write("\t")
 for tok in token2:
    sys.stdout.write("\t")
    sys.stdout.write(tok.rjust(8))
 print()
 for j in range(0,len(v.state)):
     sys.stdout.write(v.state[j])
     sys.stdout.write("\t")
     for i in range(0,len(token2)):
         sys.stdout.write("\t")
         sys.stdout.write(str(round((Viterbi_matrix2[i][j]),5)))
         sys.stdout.write("\t")
     print()
     
 print("--------------------------------------------------------------------------------")
 
 #Stanford POS Tagging
 stanford_dir = "C:/stanford-postagger/" # change it into your own path
 model_file= stanford_dir + 'models/english-left3words-distsim.tagger'
 jarfile = stanford_dir +"stanford-postagger.jar"# jar file
 st = StanfordPOSTagger(model_filename=model_file, path_to_jar=jarfile)
 
 print("\nSentence 1: "+seq1)
 tokens1 = word_tokenize(seq1) # tokenize into words
 print("Using Stanford POS Tagging, Sentence 1 is tagged as: ")
 print(st.tag(seq1.split()))
 
 print("\nSentence 2: "+seq2)
 tokens2 = word_tokenize(seq2) # tokenize into words
 print("Using Stanford POS Tagging, Sentence 2 is tagged as: ")
 print(st.tag(seq2.split()))
Example #16
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import StanfordPOSTagger
from os.path import expanduser

home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'

st = StanfordPOSTagger(_path_to_model, _path_to_jar)


def sentenceClean2(self):
    result = self.lower()
    result = re.sub('high\squality', 'great', result)
    result = re.sub('low\squality', 'bad', result)
    result = re.sub('serve', 'service', result)
    result = re.sub('fast food', 'fastfood', result)
    result = re.sub('n\'t\s', ' not', result)
    result = re.sub('\sstars|\sstar', 'stars', result)
    result = re.sub('0stars', 'onestars', result)
    result = re.sub('1stars', 'onestars', result)
    result = re.sub('2stars', 'twostars', result)
    result = re.sub('3stars', 'threestars', result)
    result = re.sub('4stars', 'fourstars', result)
    result = re.sub('5stars', 'fivestars', result)
    result = re.sub('\snot\s', ' not', result)
    result = re.sub('\snever\s', ' never', result)
Example #17
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import StanfordPOSTagger
from os.path import expanduser

home = expanduser("~")
_path_to_model = home + '/stanford-postagger/models/english-bidirectional-distsim.tagger'
_path_to_jar = home + '/stanford-postagger/stanford-postagger.jar'

st = StanfordPOSTagger(_path_to_model, _path_to_jar)


def sentenceClean2(self):
    result = self.lower()
    result = re.sub('high\squality', 'great', result)
    result = re.sub('low\squality', 'bad', result)
    result = re.sub('serve', 'service', result)
    result = re.sub('fast food', 'fastfood', result)
    result = re.sub('\s\w+?n\'t[^\w]+?', ' not', result)
    result = re.sub('\sstars|\sstar', 'stars', result)
    result = re.sub('1stars', 'onestars', result)
    result = re.sub('2stars', 'twostars', result)
    result = re.sub('3stars', 'threestars', result)
    result = re.sub('4stars', 'fourstars', result)
    result = re.sub('5stars', 'fivestars', result)
    result = re.sub('\snot\s', ' not', result)
    result = re.sub('\snever\s', ' never', result)
    return result
Example #18
class AnnotationCompiler:
    def __init__(self, filename):
        self.filename = filename
        self.tokenizer = TreebankWordTokenizer()
        self.sent_tokenizer = load(
            'tokenizers/punkt/{0}.pickle'.format('english'))
        self.st = StanfordPOSTagger(
            '../stanfordPOStagger/english-bidirectional-distsim.tagger',
            '../stanfordPOStagger/stanford-postagger.jar',
            java_options='-mx2048m')
        #self.w2v_model = KeyedVectors.load_word2vec_format(
        #    "C:/Users/PC1/Desktop/python/деплом/deplom/constructions/GoogleNews-vectors-negative300.bin.gz",
        #    binary=True)
        self.w2v_model = None
        self.text = self.get_text()
        self.anns = []
        self.idx_list = IdxList()
        self.punct = punctuation + '‘’— \t\n'

    def get_text(self):
        with open(self.filename, encoding='utf-8-sig', newline='') as f:
            text = f.read()
        return text

    def correct_text(self, corrs, abs_idx=False):
        # given corrections (start idx,initial text, correction), correct the text
        slices = []
        last_idx = 0
        for item in corrs:
            if abs_idx:
                idx, initial, corr = item
                sent_idx = 0
            else:
                idx, sent_idx, initial, corr = item
            slices.append(self.text[last_idx:idx + sent_idx])
            slices.append(corr)
            last_idx = sent_idx + idx + len(initial)
        slices.append(self.text[last_idx:])
        return ''.join(slices)

    def ann_from_spelling(self, corrs):
        # create annotations and correct text from aspell output
        matches = [(m.group(0), m.start())
                   for m in re.finditer(r'[^\s\-]+', self.text)]
        matches = [
            x for x in matches if re.search('[0-9\\W]+', x[0]) is None
            or re.search('[0-9\\W]+', x[0]).group() != x[0]
        ]
        tokens, idx = zip(*matches)
        final_corrs = []
        anns = []
        for i, corr in enumerate(corrs):
            if corr is not None:
                tag = 'Spelling'
                start_idx = idx[i]
                end_idx = start_idx + len(corr[0])
                self.idx_list.add(end_idx, len(corr[0]) - len(corr[1]))
                anns.append(
                    ('%s %d %d\t%s' % (tag, start_idx, end_idx, corr[0]),
                     'AnnotatorNotes <ERROR>\t%s' % (corr[1])))
                final_corrs.append((start_idx, corr[0], corr[1]))
        self.text = self.correct_text(
            final_corrs,
            abs_idx=True)  # SHOULD BE SELF.TEXT WHEN IDXS ARE TACKLED
        return anns

    def ann_from_correction(self, corrs, tag):
        # start idx, sent start idx, initial np, predicted np
        anns = []
        for corr in corrs:
            start_idx = corr[0] + corr[1]
            end_idx = start_idx + len(corr[2])
            anns.append(('%s %d %d\t%s' %
                         (tag, self.idx_list.find_old_idx(start_idx),
                          self.idx_list.find_old_idx(end_idx), corr[2]),
                         'AnnotatorNotes <ERROR>\t%s' % (corr[3])))
        self.text = self.correct_text(corrs)
        return anns

    def tokenize(self):
        sents = self.sent_tokenizer.tokenize(self.text)
        sent_spans = self.sent_tokenizer.span_tokenize(self.text)
        tokens = [self.tokenizer.tokenize(sent) for sent in sents]
        idxs = [
            align_tokens(['"' if x in ['``', "''"] else x for x in toks], sent)
            for sent, toks in zip(sents, tokens)
        ]
        return sents, tokens, idxs, sent_spans

    def compile_annotation(self, path='.'):
        # collect all corrections
        sents, tokens, idxs, sent_spans = self.tokenize()
        with open(path + '/initial_sents.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(sents))
        spelling = very_dummy_spellchecker(path)
        print('Spelling')
        spell_anns = self.ann_from_spelling(spelling)
        self.anns.extend(spell_anns)
        #print([self.text])
        #print(self.idx_list)
        with open(path + '/corrected_spelling.txt',
                  'w',
                  encoding='utf-8',
                  newline='') as f:
            f.write(self.text)
        print('Tokenizing')
        sents, tokens, idxs, sent_spans = self.tokenize()
        #with open('init_sents_for_prepositions_test_parsed.txt','r',encoding='utf-8') as f:
        #    trees = parse_tree(f.read())
        ##        #agr_corrs = check_agreement(trees,sent_spans)
        print('Tagging')
        tsents = self.st.tag_sents(tokens)
        print('Prepositions')
        prep_corrector = PrepositionCorrector()
        prep_corrs, prep_idxs = prep_corrector.detect_errors(
            self.w2v_model, tokens, tsents, idxs, sents, sent_spans)
        prep_anns = self.ann_from_correction(prep_corrs, 'Prepositions')
        for idx in prep_idxs:
            self.idx_list.add(idx[0], idx[1])
##        print(self.idx_list)
        with open(path + '/corrected_prepositions.txt', 'w',
                  encoding='utf-8') as f:
            f.write(self.text)
        self.anns.extend(prep_anns)
        print('Articles')
        sents, tokens, idxs, sent_spans = self.tokenize()
        tsents = self.st.tag_sents(tokens)
        art_corrector = ArticleCorrector()
        art_corrs, art_idxs = art_corrector.detect_errors(
            self.w2v_model, tokens, tsents, idxs, sents, sent_spans)
        art_anns = self.ann_from_correction(art_corrs, 'Articles')
        for idx in art_idxs:
            self.idx_list.add(idx[0], idx[1])
        with open(path + '/corrected_articles.txt', 'w',
                  encoding='utf-8') as f:
            f.write(self.text)
        self.anns.extend(art_anns)
        print('Writing annotation')
        self.write_annotation()

    def write_annotation(self):
        with open(self.filename[:-4] + '.ann', 'w', encoding='utf-8') as f:
            for i, ann in enumerate(self.anns):
                f.write('T%d\t'%(i+1)+ann[0]+'\n'+\
                        '#%d\t'%(i+1)+ann[1].replace('<ERROR>','T%d'%(i+1))+'\n')
Example #19
from nltk import StanfordPOSTagger

text = ''' الفيتامينات هي عناصر غذائيّة أساسية لجسم الإنسان، وهي عبارة عن مركبات عضويّة توجد طبيعيّاً في الأغذية ويحتاجها الجسم بكميّات بسيطة 
للقيام بوظائفه الطبيعية، ولا يستطيع الجسم تصنيعها أو تصنيع كميّات كافية منها لتلبي احتياجاته'''

Tagger = StanfordPOSTagger(
    './stanfor arabic modeal and tagger/arabic.tagger',
    './stanfor arabic modeal and tagger/stanford-postagger.jar')
output = Tagger.tag(text.split())
output = [tuple(filter(None, tp)) for tp in output]  # drop empty strings from each tuple

for data in output:
    print(data[0].split("/")[0] + " > " + data[0].split("/")[1] + "\n")

# References:
# 1. Stanford Arabic part-of-speech tagset
# https://www.sketchengine.co.uk/stanford-arabic-part-of-speech-tagset/
# 2. Stanford POS tagger
# https://nlp.stanford.edu/software/pos-tagger-faq.html#tagset
Example #20
    def __init__(self, models_path=None):
        models_path = models_path or os.environ["MODELS_PATH"]
        jar_file = Path(models_path, "stanford-postagger.jar")
        tagger_file = Path(models_path, "spanish.tagger")

        self.tagger = StanfordPOSTagger(str(tagger_file), str(jar_file))
Example #21
class ActionListGenerator:
    def __init__(self,sentence, graph):
        self.Construct_Pattern_House()
        self.sentence = sentence
        self.rdfgraph = graph
       # self.sentence = sentence
        self.st = StanfordPOSTagger('chinese-distsim.tagger')
        self.nodecount = dict()

    def Construct_Pattern_House(self):
        self.patterns = []
        self.patterns.append([u'当 (N) (V) (N) 时', 'event'])
        self.patterns.append([u'{哪} () [的]{0,1} (N) [的]{0,1} 股价 {涨幅} [会]{0,1} [最大|最多]', 'stock_increase'])
        self.patterns.append([u'{哪} (N) 股 [的|将]{0,1}  {涨} [会]{0,1} [得]{0,1}  [最大|最多]', 'specific_type_stock_increase'])

    def Generate(self):
        self.words = jieba.cut(self.sentence)
        self.sentence2 = ' '.join(list(self.words))
        self.pos = self.st.tag(self.sentence2.split())

        self.senpos = [(sp.split('#')[0], sp.split('#')[1]) for _, sp in self.pos]
        print self.sentence2
        print self.pos

        self.actions = ActionList(self.rdfgraph)

        for pat in self.patterns:
            self.match(self.senpos, pat[0], pat[1])

        print self.actions

    def GetCount(self, pattype):
        if pattype in self.nodecount:
            ID = self.nodecount[pattype]
            self.nodecount[pattype] += 1
            return ID
        else:
            self.nodecount[pattype] = 1
            return 0


    def match(self, senpos, pattern, pattype):
        patarr = pattern.split()
        paralist = []
        i=0
        canmatch = True
        while i < len(senpos):
            canmatch = True
            regextra = 0
            j = 0
            while j < len(patarr):
                if patarr[j][0]=='(':
                    if patarr[j][1:-1] in senpos[i+j + regextra][1]:
                        paralist.append(senpos[i+j + regextra][0])
                    else:
                        canmatch = False
                        break
                elif patarr[j][0]=='[':
                    contentstr = patarr[j].split(']')[0][1:]
                    contents = contentstr.split('|')
                    if patarr[j][-1]=='}':
                        times = patarr[j].split('{')[1][:-1].split(',')
                        minimum_allowed_occurance = int(times[0])
                        maximum_allowed_occurance = int(times[1])
                        repeat = 0
                        for repeatednum in range(minimum_allowed_occurance, maximum_allowed_occurance + 1):
                            if senpos[i + j + regextra + repeatednum][0] in contents:
                                repeat = repeatednum
                            else:
                                if repeatednum == 0:
                                    regextra -= 1
                                else:
                                    regextra += repeat
                                break
                    else:
                        if senpos[i + j + regextra][0] in contents:
                            pass
                        else:
                            canmatch = False
                            break

                elif patarr[j][0]=='{':
                    content = patarr[j][1:-1]
                    if content in senpos[i+j + regextra][0]:
                        pass
                    else:
                        canmatch = False
                        break


                elif patarr[j] == senpos[i+j + regextra][0]:
                    pass
                else:
                    canmatch = False
                    break

                j+=1

            if canmatch:
                break
            else:
                paralist = []

            i += 1




        ID = lambda x: str(self.GetCount(x))
        if pattype == 'event':
            if len(paralist) != 3 or not canmatch:
                return []

            tid =  ID('t')

            res  = ['SELECT ?t'+ tid, "  WHERE   ", "{ "]
            NodeID = ID(pattype)
            res.append('?event'+NodeID + ' <http://www.example.org/subject>  \"' + paralist[0]+'\" .')
            res.append('?event'+NodeID + ' <http://www.example.org/trigger> \"' + paralist[1]+'\" .')
            res.append('?event'+NodeID + ' <http://www.example.org/object> \"' + paralist[2]+'\" .')
            res.append('?event'+NodeID + ' <http://www.example.org/time>  ?t' + tid + '  .')
            res.append('}')


            command = '\n'.join(res)

            act = Action('sparql')
            act.setCommand(command)
            act.inputtype = 'None'
            act.keydict['subject'] = paralist[0]
            act.returntype = 'value'
            self.actions.add(act)


        elif pattype == 'stock_increase':
            if  not canmatch:
                return []

            if len(paralist) == 1:
                companyname = self.actions[-1].keydict['subject']
                pass
            elif len(paralist) == 2:
                companyname = paralist[0]
                pass

            res = ['SELECT ?support ?p  ', "WHERE   ", "{ "]
            NodeID = ID('company')
            res.append('?company'+NodeID + ' <http://www.example.org/support>  ?support .')
            res.append('?company'+NodeID + ' <http://www.example.org/name> \"' + companyname +'\" .')
            supportNodeID = ID('supportnode')
            stockNodeID = ID('stocknode')
            res.append('?supportnode'+supportNodeID + ' <http://www.example.org/name>  ?support .')
            res.append('?supportnode'+supportNodeID + ' <http://www.example.org/stock>  ?stock'+stockNodeID + ' .')
            res.append('?stock'+stockNodeID + ' <http://www.example.org/stocktime>  \"%s\" .')
            res.append('?stock'+stockNodeID + ' <http://www.example.org/price>  ?p .')
            res.append('}')
            command = '\n'.join(res)

            act = Action('sparql')
            act.inputtype = 'timestamp'
            act.setCommand(command)
            self.actions.add(act)

            act1 = copy.deepcopy(act)
            act1.inputtype = 'latertimestamp'
            self.actions.add(act1)

            actminus = Action('minus')
            actminus.inputtype='table'
            self.actions.add(actminus)


            actmax = Action('max')
            actmax.inputtype='table'
            self.actions.add(actmax)

        elif pattype == 'specific_type_stock_increase':
            if  not canmatch:
                return []

            stocktype = paralist[0]

            res = ['SELECT ?company ?p  ', "WHERE   ", "{ "]
            companyNodeID = ID('company')
            stockNodeID = ID('stocknode')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/name>  ?company .')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/stock>  ?stock' + stockNodeID + ' .')
            res.append('?companynode' + companyNodeID + ' <http://www.example.org/type>  \"' + stocktype + '\" .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/stocktime>  \"%s\" .')
            res.append('?stock' + stockNodeID + ' <http://www.example.org/price>  ?p .')
            res.append('}')
            command = '\n'.join(res)

            act = Action('sparql')
            act.inputtype = 'timestamp'
            act.setCommand(command)
            self.actions.add(act)

            act1 = copy.deepcopy(act)
            act1.inputtype = 'latertimestamp'
            self.actions.add(act1)

            actminus = Action('minus')
            actminus.inputtype='table'
            self.actions.add(actminus)


            actmax = Action('max')
            actmax.inputtype='table'
            self.actions.add(actmax)
Example #22
        for docid, weight in postings:
            accumulator[docid] += weight
    return accumulator.most_common(k)

############ End of copied code




#printing start time of the script
print("Start Time:",ctime())

#initializing taggers and models from NLTK
os.environ["STANFORD_MODELS"] = "/chechi/Documents/StanfordNER"
stanford_NER_tagger = StanfordNERTagger('/Users/chechi/Documents/StanfordNER/english.all.3class.distsim.crf.ser.gz','/Users/chechi/Documents/StanfordNER/stanford-ner.jar')
stanford_POS_tagger = StanfordPOSTagger('/Users/chechi/Documents/StanfordNER/english-bidirectional-distsim.tagger','/Users/chechi/Documents/StanfordNER/stanford-postagger.jar')
stemmer = nltk.stem.PorterStemmer()



#This is the cache file that will store the precomputed best sentences and tags
#so that we don't have to tag each time we run this script
if(runOn=="DEV"):
    fname = 'bestSentencesTaggedDev.bin'
else:
    fname = 'bestSentencesTaggedTrain.bin'


#This variable will store all tagged most relevant sentences
NER_tagged = None
Example #23
from nltk.corpus import stopwords
import nltk
from nltk import ne_chunk, pos_tag, Tree
from nltk.stem import PorterStemmer
import re
import html
from nltk import StanfordPOSTagger, StanfordNERTagger
from feature_extraction.resources import cList

model_pos_tag = '../stanford-postagger-2018-10-16/models/english-bidirectional-distsim.tagger'
jar_pos_tag = '../stanford-postagger-2018-10-16/stanford-postagger.jar'

model_en_tag = '../stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz'
jar_en_tag = '../stanford-ner-2018-10-16/stanford-ner-3.9.2.jar'

tagger_pos = StanfordPOSTagger(model_pos_tag, path_to_jar=jar_pos_tag, encoding='UTF-8')

tagger_en = StanfordNERTagger(model_en_tag, path_to_jar=jar_en_tag, encoding='UTF-8')

# preprocessing helper function to obtain string without html tags
def html_and_remove(entry):
    return re.sub(r'<.*?>', '', html.unescape(entry))

# aggregate function removing all html tags from data
def remove_html_tags(data):
    for count, entry in enumerate(data):
        print(count)
        entry['postText'][0] = html_and_remove(entry['postText'][0])
        entry['targetTitle'] = html_and_remove(entry['targetTitle'])
        entry['targetDescription'] = html_and_remove(entry['targetDescription'])
        entry['targetKeywords'] = html_and_remove(entry['targetKeywords'])