def GetData(path, to_lower=False, remove_proper_nouns=False):
    files = os.listdir(path)
    corpus = ""
    for filename in files:
        with open(os.path.join(path, filename), "r") as infile:
            for line in infile.readlines():
                corpus += line

    corpus = re.sub("\n\n+", ". ", corpus)
    corpus = re.sub("\n", " ", corpus)
    tokenizer = nltk.PunktSentenceTokenizer()
    corpus = tokenizer.tokenize(corpus)

    data = []
    count_dict = {}
    for sentence in corpus:
        sentence = nltk.word_tokenize(sentence)
        if(remove_proper_nouns):
            sentence = RemoveProperNouns(sentence)
        sentence = RemovePunctuations(sentence, to_lower=to_lower)
        if(len(sentence) > 0):
            for word in sentence:
                if(word in count_dict):
                    count_dict[word] += 1
                else:
                    count_dict[word] = 1

            data.append(sentence)

    return data, count_dict
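A minimal usage sketch for GetData (the directory name is a placeholder; it assumes a folder of plain-text files and that NLTK's tokenizer data, e.g. 'punkt', has been downloaded):

data, count_dict = GetData("corpus_dir", to_lower=True)  # hypothetical path
print(len(data), "sentences,", len(count_dict), "distinct tokens")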
Example #2
def _tokenize_by_sentence(text, return_spans):
    tokenizer = nltk.PunktSentenceTokenizer()

    if return_spans:
        return tokenizer.span_tokenize(text)
    else:
        return tokenizer.tokenize(text)
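A quick illustration of the two return shapes (spans are (start, end) character offsets into the text):

text = "Dogs bark. Cats meow."
print(_tokenize_by_sentence(text, return_spans=False))       # ['Dogs bark.', 'Cats meow.']
print(list(_tokenize_by_sentence(text, return_spans=True)))  # e.g. [(0, 10), (11, 21)]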
Example #3
    def file_read(self, input_text):

        with open(input_text, 'rb') as infile:
            doc = infile.read().decode('utf-8', 'replace')

        #Sentence tokenizing
        doc = ' '.join(doc.strip().split('\n'))
        sentence_tokenizer = nltk.PunktSentenceTokenizer()
        sentences = sentence_tokenizer.tokenize(doc)
        return sentences
Example #4
    def file_read(self, input_text):
        with open(input_text, encoding='utf-8', errors='replace') as docfile:
            doc = docfile.read()

        #Sentence tokenizing
        doc = ' '.join(doc.strip().split('\n'))
        sentence_tokenizer = nltk.PunktSentenceTokenizer()
        sentences = sentence_tokenizer.tokenize(doc)
        return sentences
Example #5
def process_all_reviews(reviews):
    me.connect("data-mining")
    for review in reviews:
        custom_sent_tokenizer = nltk.PunktSentenceTokenizer(review.review)
        sentences = custom_sent_tokenizer.tokenize(review.review)
        sent_list = review_description_sentence_list(sentences)

        item = MongoGansevoortReview(corresponding_id=review.id,
                                     paragraph=sent_list,
                                     description=review.review,
                                     date=review.date)
        item.save()
Example #6
def nltk_punkt_sentence_tokenizer(input_dict):
    """
    A sentence tokenizer which uses an unsupervised algorithm to build
    a model for abbreviation words, collocations, and words that start
    sentences; and then uses that model to find sentence boundaries.
    This approach has been shown to work well for many European
    languages.

    :param input_dict: widget input dictionary (unused here; default {})
    :returns tokenizer: A Python dictionary containing the Tokenizer object and its arguments.
    """

    return {'tokenizer': {'object': nltk.PunktSentenceTokenizer()}}
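A brief sketch of how the returned dictionary might be consumed downstream (the calling code here is hypothetical; the workflow framework that normally wraps this widget is not shown):

widget_output = nltk_punkt_sentence_tokenizer({})
tok = widget_output['tokenizer']['object']          # unwrap the PunktSentenceTokenizer
print(tok.tokenize("First sentence. Second sentence."))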
Example #7
def get_important_sent(html_content):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'miss', 'prof', 'inc', 'no', 'cap', 'nos',
        'vol', 'para', 'exh'
    ])
    tokenizer = nltk.PunktSentenceTokenizer(punkt_param)
    soup = BeautifulSoup(html_content, 'html.parser')
    content = soup.get_text()
    paras = get_paras(content)
    sents = []
    for para in paras:
        para_content = content[para[0]:para[1] + 1]
        for sent in tokenizer.span_tokenize(para_content):
            sents.append(para_content[sent[0]:sent[1] + 1])
    sents = np.array(sents)
    BertTokenizer = bert.bert_tokenization.FullTokenizer(VOCAB_FILE,
                                                         do_lower_case=True)
    input_ids, input_mask, segment_ids = convert_all_sentences(
        clean_data(sents), BertTokenizer)
    model = tf.keras.models.load_model("bert_model")
    input_X = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids
    }
    sents = sents[(model.predict(input_X, batch_size=1) > 0.4).reshape(-1, )]
    for sent in sents:
        segs = filter(lambda seg: seg != "", sent.split("\n"))
        for seg in segs:
            seg = seg.replace("\xa0", " ")
            while seg:
                cur = len(seg)
                while True:
                    if not cur:
                        return html_content
                    cur_str = seg[:cur]
                    res = html_content.find(cur_str)
                    if res == -1:
                        cur -= 1
                    else:
                        html_content = add_important_class(
                            html_content, res, res + len(cur_str))
                        seg = seg[cur:]
                        break
    return html_content
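For context, a small standalone sketch of the abbreviation setup used above; registering lowercase types such as 'no' and 'vol' keeps the tokenizer from breaking a sentence after 'No.' or 'Vol.' (the sample text is illustrative only):

from nltk.tokenize.punkt import PunktParameters
import nltk

punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'no', 'vol'])
tok = nltk.PunktSentenceTokenizer(punkt_param)
# 'Vol.' and 'No.' are treated as abbreviations, so no sentence break occurs after them.
print(tok.tokenize("See Vol. 2, No. 5 for details. The appendix follows."))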
Example #8
    def __init__(self, gid, gdata):
        assert self.TYPE

        self.gid = gid
        self.gdata = gdata

        self.tags = []
        self.media = []
        self.comments = []

        self.content = None
        self.title = None

        self.published = date_parse(self.gdata['published'])
        self.updated = date_parse(self.gdata['updated'])

        # Convert content to HTML so we can:
        #  * Determine if the page has content
        #  * Create a better title
        H2T = html2text.HTML2Text()
        H2T.ignore_links = True
        H2T.ignore_images = True
        H2T.ignore_emphasis = True
        H2T.body_width = 0
        txtcontent = H2T.handle(self.gdata['object']['content'])
        lines = [x for x in txtcontent.split('\n') if x.strip()]

        if not lines:
            self.has_content = False
            self.title = None
        else:
            # Take the first sentence as the title
            tokenizer = nltk.PunktSentenceTokenizer()
            sentences = tokenizer.tokenize(lines[0])
            self.title = sentences[0].strip()

            # If we just have a link, guess we don't have a title
            if self.title.startswith('http://') \
                    or self.title.startswith('https://'):
                self.title = None

            self.has_content = bool(sentences[1:]) or bool(lines[1:])

            # FIXME: Should we strip the title from the content?
            self.content = self.gdata['object']['content']
Example #9
def trainSentenceTokenizer():
    """
    Method trains custom sentence tokenizer using punk.
    At the moment it preforms worse then plain english one (most likely due to not that much data)
    """
    collection = database["crawled-data"]

    text = ""
    for record in collection.find({ABSTRACT_DOCUMENT: {"$ne": None}}):
        text += record[ABSTRACT_DOCUMENT] + " "

    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.INCLUDE_ABBREV_COLLOCS = True
    trainer.train(text)

    model = nltk.PunktSentenceTokenizer(trainer.get_params())
    with open("latvianPunkt2.pickle", mode='wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)
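A short sketch of loading and applying the pickled model written above (assumes the training step has already produced latvianPunkt2.pickle; the sample sentence is placeholder Latvian):

import pickle

with open("latvianPunkt2.pickle", mode='rb') as fin:
    loaded_tokenizer = pickle.load(fin)
print(loaded_tokenizer.tokenize("Pirmais teikums. Otrais teikums."))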
Example #10
    def tokenizeDocument(self, doc, tokenize='sentence', returnDoc=False):

        if tokenize == 'sentence':
            spt = nltk.PunktSentenceTokenizer()

            if isinstance(doc, (pd.DataFrame, pd.Series)):
                tokens = doc.apply(
                    lambda row: spt.tokenize(' '.join(row))).values
            elif isinstance(doc, list):
                tokens = list()
                for each in doc:
                    tokens.append(spt.tokenize(each))
            else:
                tokens = spt.tokenize(doc)
        elif tokenize == 'word':

            wpt = nltk.WordPunctTokenizer()

            if isinstance(doc, (pd.DataFrame, pd.Series)):
                tokens = doc.apply(
                    lambda row: wpt.tokenize(' '.join(row))).values
            elif isinstance(doc, list):
                tokens = list()
                for each in doc:
                    tokens.append(wpt.tokenize(each))
            else:
                tokens = wpt.tokenize(doc)

        preProcObj = TextPreprocessor()

        #final_tokens = preProcObj.preprocess_text(tokens,[preProcObj.removeStopWords,preProcObj.removeNumbers,preProcObj.removeEmptyString],strFlag=False)
        #final_tokens = preProcObj.preprocess_text(tokens,[preProcObj.lowercase,preProcObj.lemmatize,preProcObj.removePunctuation,preProcObj.removeEmptyString,preProcObj.removehypen],strFlag=False)
        final_tokens = preProcObj.preprocess_text(tokens,
                                                  [preProcObj.lowercase],
                                                  strFlag=False)

        if returnDoc:
            # re-create document from filtered tokens
            doc = ' '.join(final_tokens)
            return final_tokens, doc
        else:
            return final_tokens
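A standalone sketch of the pandas branch above, assuming each row holds a list of strings (the TextPreprocessor step is omitted):

import nltk
import pandas as pd

spt = nltk.PunktSentenceTokenizer()
s = pd.Series([["First sentence.", "Second one here."], ["Only one sentence."]])
# Mirror of the Series/DataFrame branch: join each row, then split it back into sentences.
print(s.apply(lambda row: spt.tokenize(' '.join(row))).values)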
Example #11
def make_text_clear(text_in, synonymizer, morph):
    sentences = nltk.PunktSentenceTokenizer().tokenize(text_in.lower())
    text_items_in = []
    for sentence in sentences:
        for word in nltk.WordPunctTokenizer().tokenize(sentence):
            text_items_in.append(word)
    text_out = u''
    punct = set(string.punctuation)
    for item in text_items_in:
        if item not in punct:
            word_info = morph.parse(item)[0]
            norm = word_info.normal_form
            synonyms = synonymizer.synonymize(norm)
            print "-----------------"
            print("Word: %s" % norm)
            print("Synonyms:")
            for syn, freq in synonyms:
                print("%s : %s" % (syn, freq))
            print "-----------------"
            if len(synonyms) is not 0:
                best_synonym = synonyms[0][0]
                if synonymizer.need_replace(norm, best_synonym):
                    grammemes = [
                        word_info.tag.POS, word_info.tag.aspect,
                        word_info.tag.case, word_info.tag.mood,
                        word_info.tag.number, word_info.tag.person,
                        word_info.tag.tense, word_info.tag.voice
                    ]
                    tag = set(gram for gram in grammemes if gram is not None)
                    syn_info = morph.parse(best_synonym)[0].inflect(tag)
                    if syn_info is not None:
                        text_out += syn_info.word
                        text_out += ' '
                        continue
        text_out += item
        text_out += ' '
    return text_out
Example #12
 def split_text(text):
     tokenizer = nltk.PunktSentenceTokenizer()
     return tokenizer.tokenize(text)
Example #13
import nltk

#loading speech of George W Bush
trainText = nltk.corpus.state_union.raw("2005-GWBush.txt")
sampleText = nltk.corpus.state_union.raw("2006-GWBush.txt")

#Punkt Sentence Tokeniser
punktSentenceTokeniser = nltk.PunktSentenceTokenizer(trainText)

sentTokens = punktSentenceTokeniser.tokenize(sampleText)


def processContent():
    try:
        for tokens in sentTokens:
            wordTokens = nltk.word_tokenize(tokens)
            tagged = nltk.pos_tag(wordTokens)

            # nameEnt = nltk.ne_chunk(tagged, binary=True)  # binary=True does not classify the named entity as location, money, etc.; it just labels it as a named entity
            nameEnt = nltk.ne_chunk(tagged)
            nameEnt.draw()

    except Exception as e:
        print(str(e))


processContent()
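Because nameEnt.draw() opens a Tk window, a non-GUI variant (a sketch; the function name is hypothetical) can be more convenient when running headless. It assumes the usual NLTK resources ('averaged_perceptron_tagger', 'maxent_ne_chunker', 'words') are installed:

def processContentNoGui(limit=3):
    # Same pipeline as above, but prints the NE chunk trees instead of drawing them.
    for tokens in sentTokens[:limit]:
        tagged = nltk.pos_tag(nltk.word_tokenize(tokens))
        print(nltk.ne_chunk(tagged))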
Example #14
from reviewsentencescore import maximalsentencenumber, sentencenumber, location, nw, phraseindicator, \
    reviewsentencescore


def sentenceimportance(ch, cr, rca):
    return (ch + cr + rca) / 3


dbname = "data-mining"
connect(dbname)
ReviewSentences.drop_collection()
Sentence.drop_collection()

for index, rev in enumerate(GansevoortReview.objects):

    custom_sent_tokenizer = nltk.PunktSentenceTokenizer(rev.description)
    sentences = custom_sent_tokenizer.tokenize(rev.description)
    sent_list = review_description_sentence_list(sentences)
    maximal = maximalsentencenumber(sent_list)

    CH = ReviewHelpfulness.objects.get(reviewId=rev.id).value
    CR = ReviewRecency.objects.get(reviewId=rev.id).value

    try:
        JOIN = UserReviews.objects.get(name=rev.name).id
        RCA = ReviewAuthorRepresentativeness.objects.get(authorId=JOIN).value
    except DoesNotExist:
        RCA = None
    except MultipleObjectsReturned:
        RCA = None
        print(f"two items were returned")
Example #15
def cut_text(full_text):
    list_of_sentences = nltk.PunktSentenceTokenizer().tokenize(full_text)
    sen_num = float(len(list_of_sentences))
    for s in list_of_sentences:
        cut_sentence(s)

    sum = 0.0
    for word in all_words:
        sum += len(word)  # sum of letters in the sentence
        all_words_in_text.append(word)
        lemmatise(word)

    sym_count = 0
    word5 = 0
    word6 = 0
    word7 = 0
    word8 = 0
    word9 = 0
    word10 = 0
    word11 = 0
    word12 = 0
    word13 = 0
    for w in all_words_in_text:
        sym_count += len(w)
        if len(w) > 4:
            word5 += 1
        if len(w) > 5:
            word6 += 1
        if len(w) > 6:
            word7 += 1
        if len(w) > 7:
            word8 += 1
        if len(w) > 8:
            word9 += 1
        if len(w) > 9:
            word10 += 1
        if len(w) > 10:
            word11 += 1
        if len(w) > 11:
            word12 += 1
        if len(w) > 12:
            word13 += 1

    final_list.append(sym_count / sen_num)
    final_list.append(sym_count / float(len(all_words_in_text)))
    final_list.append(word5 / float(len(all_words_in_text)))
    final_list.append(word6 / float(len(all_words_in_text)))
    final_list.append(word7 / float(len(all_words_in_text)))
    final_list.append(word8 / float(len(all_words_in_text)))
    final_list.append(word9 / float(len(all_words_in_text)))
    final_list.append(word10 / float(len(all_words_in_text)))
    final_list.append(word11 / float(len(all_words_in_text)))
    final_list.append(word12 / float(len(all_words_in_text)))
    final_list.append(word13 / float(len(all_words_in_text)))
    del all_words_in_text[:]

    count = 0
    for vow_num in sent_words_in_vowels:
        count += vow_num
    final_list.append(count / sen_num)
    del sent_words_in_vowels[:]

    summ = 0
    for number in sum_sen_length:
        summ += number

    w = 0
    vow3 = 0
    vow4 = 0
    vow5 = 0
    vow6 = 0
    for vow in all_words_in_vowels:
        w += vow
        if vow > 2:
            vow3 += 1
        if vow > 3:
            vow4 += 1
        if vow > 4:
            vow5 += 1
        if vow > 5:
            vow6 += 1

    words_in_sent = float(len(all_words))
    final_list.append(w / sum)
    final_list.append(vow3 / words_in_sent)
    final_list.append(vow4 / words_in_sent)
    final_list.append(vow5 / words_in_sent)
    final_list.append(vow6 / words_in_sent)

    av_sen_length = summ / sen_num
    final_list.append(av_sen_length)
    av_word = sum / len(all_words)
    final_list.append(av_word)
    final_list.append(sum)
    final_list.append(len(all_words))

    finding(poses)
    # [av vowels in sent, av vowels in text, % of word5, % of word6, % of word7, % of word8, % of word9, % of word10, % of word11, % of word12, % of word13, % of vow3, % of vow4, % of vow5, % of vow6, av len of words in sent, av len of words in symbols, av # words in sent, av word len in words, text in symbols, text_in words, pos, pos, ..., pos]
    # basically all the parameters here except the lexical-minimum check
    print(final_list)
    del final_list[:], sum_sen_length[:], all_words[:], poses[:], all_words_in_vowels[:]
Example #16
 def cut_sentence(self, full_text):
     list_of_sentences = nltk.PunktSentenceTokenizer().tokenize(full_text)
     for s in list_of_sentences:
         self.cut_words(s)
Example #17
from gensim import models
import nltk
import pandas as pd

wvec = models.Word2Vec
import os
MODEL_NAME = 'w2vmodel'

if os.path.exists(MODEL_NAME):
    model = wvec.load(MODEL_NAME)
else:
    # os.chdir('TrippyMain/AI')
    df_all = pd.read_pickle('../Warehouse/UdpRevFin.pkl')
    all_text = '. '.join(list(df_all['Text']))
    pk = nltk.PunktSentenceTokenizer()
    sentences = [nltk.word_tokenize(i) for i in pk.tokenize(all_text)]
    model = wvec(sentences, min_count=5, size=500, workers=3)
    model.save(MODEL_NAME)


def sim(a, b):
    return model.wv.similarity(a, b)
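Hypothetical usage of sim(); both words must appear at least min_count times in the review corpus, otherwise gensim raises a KeyError:

print(sim('hotel', 'room'))  # placeholder words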
Example #18
    paras, last_start = [], 0
    for sign in all_sign:
        paras.append((last_start, sign.span()[0] - 1))
        last_start = sign.span()[1]
    paras.append((last_start, len(content) - 1))
    return paras


RAW_DATA_FILE = 'raw/drug_labeled_20200408.json'
OUTPUT_DATA_FILE = 'data/drug_features_classification.csv'
punkt_param = PunktParameters()
punkt_param.abbrev_types = set([
    'dr', 'vs', 'mr', 'mrs', 'miss', 'prof', 'inc', 'no', 'cap', 'nos', 'vol',
    'para', 'exh'
])
tokenizer = nltk.PunktSentenceTokenizer(punkt_param)

with open(RAW_DATA_FILE, 'r', encoding='UTF-8') as input_f:
    lines = input_f.readlines()
    with open(OUTPUT_DATA_FILE, 'w', encoding='UTF-8') as output_f:
        csv_writer = csv.writer(output_f)
        csv_writer.writerow(['neutral_citation', 'sentence_id', 'sentence'] +
                            labels)
        with tqdm(total=len(lines), unit_scale=True) as pbar:
            for document_count, line in enumerate(lines):
                data = json.loads(line)
                neutral_citation, annotations = extract_annotations(
                    data["annotation"])
                if not neutral_citation: neutral_citation = document_count
                paras = get_paragraphs(data["content"])
                count = 0
Example #19
    def extract_case_refs(self,
                          referenced_by: Case,
                          content: str,
                          key: int = 0):
        """
        BVerwG, Urteil vom 20. Februar 2013, - 10 C 23.12 -
        BVerwG, Urteil vom 27. April 2010 - 10 C 5.09 -
        BVerfG, Beschluss vom 10.07.1989, - 2 BvR 502, 1000, 961/86 -
        BVerwG, Urteil vom 20.02.2013, - 10 C 23.12 -
        OVG Nordrhein-Westfalen, Urteil vom 21.2.2017, - 14 A 2316/16.A -
        OVG Nordrhein-Westfalen, Urteil vom 29.10.2012 – 2 A 723/11 -
        OVG NRW, Urteil vom 14.08.2013 – 1 A 1481/10, Rn. 81 –
        OVG Saarland, Urteil vom 2.2.2017, - 2 A 515/16 -
        OVG Rheinland-Pfalz, Urteil vom 16.12.2016, -1A 10922/16 -
        Bayrischer VGH, Urteil vom 12.12.16, - 21 B 16.30364
        OVG Nordrhein-Westfalen, Urteil vom 21.2.2017, - 14 A 2316/16.A -
        Bayrischer VGH, Urteil vom 12.12.2016, - 21 B 16.30372 -
        OVG Saarland, Urteil vom 2.2.2017, - 2 A 515/16 -
        OVG Rheinland-Pfalz, Urteil vom 16.12.2016, -1A 10922/16 -
        VG Minden, Urteil vom 22.12.2016, - 1 K 5137/16.A -
        VG Gießen, Urteil vom 23.11.2016, - 2 K 969/16.GI.A
        VG Düsseldorf, Urteil vom 24.1.2017, - 17 K 9400/16.A
        VG Köln, Beschluss vom 25.03.2013 – 23 L 287/12 -
        OVG Schleswig, Beschluss vom 20.07.2006 – 1 MB 13/06 -
        Schleswig-Holsteinisches Verwaltungsgericht, Urteil vom 05.082014 – 11 A 7/14, Rn. 37 –
        Entscheidung des Bundesverwaltungsgerichts vom 24.01.2012 (2 C 24/10)

        EuGH Urteil vom 25.07.2002 – C-459/99 -

        TODO all court codes + case types

        - look for (Entscheidung|Bechluss|Urteil)
        - +/- 50 chars
        - find VG|OVG|Verwaltungsgericht|BVerwG|...
        - find location
        - find file number - ... - or (...)

        TODO

        Sentence tokenizer
        - remove all "special endings" \s([0-9]+|[a-zA-Z]|sog|Abs)\.
        - remove all dates

        :param key:
        :param content:
        :return:
        """

        refs = []
        original = content
        text = content

        # print('Before = %s'  % text)

        # Clean up text; replacing all chars that can lead to wrong sentences
        text = self.clean_text_for_tokenizer(text)

        # TODO
        from nltk.tokenize.punkt import PunktParameters
        punkt_param = PunktParameters()
        abbreviation = ['1', 'e', 'i']
        punkt_param.abbrev_types = set(abbreviation)
        # tokenizer = PunktSentenceTokenizer(punkt_param)

        offset = 0
        marker_offset = 0

        for start, end in nltk.PunktSentenceTokenizer().span_tokenize(text):
            length = end - start
            sentence = text[start:end]
            original_sentence = original[start:end]

            matches = list(re.finditer(r'\((.*?)\)', original_sentence))

            logger.debug('Sentence (matches: %i): %s' %
                         (len(matches), sentence))
            logger.debug('Sentence (original): %s' % original_sentence)

            for m in matches:
                # pass
                # print('offset = %i, len = %i' % (offset, len(sentence)))
                #
                # print('MANGLED: ' + sentence)
                logger.debug('Full sentence // UNMANGLED: ' +
                             original_sentence)

                # focus_all = original[start+m.start(1):start+m.end(1)].split(',')
                focus_all = original_sentence[m.start(1):m.end(1)].split(',')

                # print(m.group(1))
                logger.debug('In parenthesis = %s' % focus_all)

                # Split
                for focus in focus_all:

                    # Search for file number
                    fns_matches = list(
                        re.finditer(self.get_file_number_regex(), focus))

                    if len(fns_matches) == 1:
                        fn = fns_matches[0].group(0)
                        pos = fns_matches[0].start(0)

                        logger.debug('File number found: %s' % fn)

                        # Find court
                        court_name = None
                        court_pos = 999999
                        court_matches = list(
                            re.finditer(self.get_court_name_regex(),
                                        original_sentence))

                        if len(court_matches) == 1:
                            # Yeah everything is fine
                            court_name = court_matches[0].group(0)

                        elif len(court_matches) > 0:
                            # Multiple results, choose the one that is closest to file number
                            for cm in court_matches:
                                if court_name is None or abs(
                                        pos - cm.start()) < court_pos:
                                    court_name = cm.group(0)
                                    court_pos = abs(pos - cm.start())
                        else:
                            # no court found, guess by search query
                            # probably the court of the current case? test for "die kammer"
                            pass

                        # Find date
                        # TODO

                        logger.debug('Filename = %s' % fn)
                        logger.debug('Courtname = %s' % court_name)

                        ref_start = start + m.start(1) + pos
                        ref_end = ref_start + len(fn)

                        if court_name is None:

                            # raise )
                            # TODO Probably same court as current case (use case validation)
                            logger.error(
                                AmbiguousReferenceError(
                                    'No court name found - FN: %s' % fn))
                            # logger.debug('Sentence: %s' % (fn, original_sentence)))
                            continue

                        ref_ids = [{
                            'type':
                            'case',
                            'ecli':
                            'ecli://de/' + slugify(court_name) + '/' +
                            slugify(fn.replace('/', '-'))
                        }]
                        # TODO maintain order for case+law refs
                        ref = CaseReferenceMarker(referenced_by=referenced_by,
                                                  text=focus,
                                                  start=ref_start,
                                                  end=ref_end,
                                                  line=0)  # TODO line number
                        ref.set_uuid()
                        ref.set_references(ref_ids)

                        refs.append(ref)

                        content, marker_offset = ref.replace_content(
                            content, marker_offset, key + len(refs))

                        pass
                    elif len(fns_matches) > 1:
                        logger.warning('More file numbers found: %s' %
                                       fns_matches)

                        pass
                    else:
                        logger.debug('No file number found')

        return content, refs