예제 #1
0
def segment_setences(words, lang="en"):
    """Group timed words into sentences using an NNSplit model.

    The word texts are joined with single spaces, split into sentences by
    ``NNSplit.load(lang)``, and each sentence is then mapped back onto the
    run of ``words`` it covers.

    Args:
        words: Sequence of dicts; each one is read for its ``"text"``,
            ``"start"`` and ``"end"`` keys.
        lang: Model identifier passed to ``NNSplit.load``.

    Returns:
        A list of dicts with the keys ``"start"`` (from the first word of
        the sentence), ``"end"`` (from its last word) and ``"text"`` (the
        stripped sentence text).
    """
    content = " ".join(word["text"] for word in words)
    splits = NNSplit.load(lang).split([content])

    sentences = []
    left = 0  # index of the first word not yet assigned to a sentence

    for tokens2d in tqdm(splits):
        for tokens in tokens2d:
            text = "".join(str(token) for token in tokens).strip()

            # Initial guess: treat every token as one word, clamped to the
            # index of the last available word.
            right = min(len(words), left + len(tokens)) - 1

            # Walk back until the sentence text ends with the word at
            # `right`. The scan stops at `left` so a sentence can never end
            # before it starts, and `left` never rewinds onto words that
            # were already consumed by an earlier sentence.
            while right > left and not text.endswith(words[right]["text"]):
                right -= 1

            sentences.append({
                "start": words[left]["start"],
                "end": words[right]["end"],
                "text": text,
            })

            left = right + 1

    return sentences
예제 #2
0
 def __init__(self, keyword, channel, contents_id):
     """Create the database engine and sentence splitter, and store the arguments.

     Args:
         keyword: Stored unchanged as ``self.keyword``.
         channel: Stored unchanged as ``self.channel``.
         contents_id: Stored unchanged as ``self.contents_id``.
     """
     # NOTE(review): the database user, password and host are hard-coded
     # in source; consider loading them from configuration instead.
     db_url = ("mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4").format(
         'root', 'robot369', '1.221.75.76', 3306, 'datacast2')
     self.engine = create_engine(db_url)
     self.splitter = NNSplit.load("en")
     self.keyword, self.channel, self.contents_id = (
         keyword, channel, contents_id)
from nnsplit import NNSplit
from sentence_transformers import SentenceTransformer
import numpy as np
import h5py
from tqdm.auto import tqdm
import zlib
import pymongo
from mongo_proxy import MongoProxy
import json
from bson import ObjectId
import time
from threading import Thread, Lock
import gc
from guppy import hpy

# Module-level NNSplit model for "en", loaded once at import with use_cuda=True.
splitter = NNSplit.load("en", use_cuda=True)

# Module-level lock. NOTE(review): what it guards is not visible here — confirm.
lock = Lock()


class JSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``ObjectId`` values as strings."""

    def default(self, o):
        """Return ``str(o)`` for an ``ObjectId``; otherwise defer to the base encoder."""
        if not isinstance(o, ObjectId):
            # Anything else gets the base-class handling.
            return super().default(o)
        return str(o)


db_pwd = "LTEG2pfoDiKfH29M"
client = MongoProxy(
    MongoClient(
        f"mongodb+srv://cdminix:{db_pwd}@cluster0.pdjrf.mongodb.net/Reviews_Data?retryWrites=true&w=majority"
예제 #4
0
    return DEFAULT_LANGUAGE_MODEL.split(s)


c = 'wethepeopleoftheunitedstatesinordertoformamoreperfectunionestablishjusticeinsuredomestictranquilityprovideforthecommondefencepromotethegeneralwelfareandsecuretheblessingsoflibertytoourselvesandourposteritydoordainandestablishthisconstitutionfortheunitedstatesofamerica'
d = 'WeholdthesetruthstobeselfevidentthatallmenarecreatedequalthattheyareendowedbytheirCreatorwithcertainunalienableRightsthatamongtheseareLifeLibertyandthepursuitofHappinessThattosecuretheserightsGovernmentsareinstitutedamongMenderivingtheirjustpowersfromtheconsentofthegovernedThatwheneveranyFormofGovernmentbecomesdestructiveoftheseendsitistheRightofthePeopletoalterortoabolishitandtoinstitutenewGovernmentlayingitsfoundationonsuchprinciplesandorganizingitspowersinsuchformastothemshallseemmostlikelytoeffecttheirSafetyandHappinessPrudenceindeedwilldictatethatGovernmentslongestablishedshouldnotbechangedforlightandtransientcausesandaccordinglyallexperiencehathshewnthatmankindaremoredisposedtosufferwhileevilsaresufferablethantorightthemselvesbyabolishingtheformstowhichtheyareaccustomedButwhenalongtrainofabusesandusurpationspursuinginvariablythesameObjectevincesadesigntoreducethemunderabsoluteDespotismitistheirrightitistheirdutytothrowoffsuchGovernmentandtoprovidenewGuardsfortheirfuturesecuritSuchhasbeenthepatientsufferanceoftheseColoniesandsuchisnowthenecessitywhichconstrainsthemtoaltertheirformerSystemsofGovernmentThehistoryofthepresentKingofGreatBritainisahistoryofrepeatedinjuriesandusurpationsallhavingindirectobjecttheestablishmentofanabsoluteTyrannyovertheseStatesToprovethisletFactsbesubmittedtoacandidworld'
r = 'HowdymynameisBrittanyPitcherandiamanelectricalengineeringmajorfromspringtxbutmostimportantlyiamtheloudestandproudestmemberofthefightingtexasaggieclassoftwentytwentyoneawhoop'
z = 'hellomynameisbrittanypitcherandmyfavoritecolorismarooniaminseniordesignrightnowthisiswhyiamworkingonthisprojectitismeanttohelpthosewhoarehardofhearingordeaftoovercomelanguagebarrierswiththeirpeersiamexcitedforittobefinishedandtodeterminghowwellitworks'

# Re-join the pieces that split() returns for c, d, r and z with single spaces.
c, d, r, z = (" ".join(split(text)) for text in (c, d, r, z))

# Try to split the text into sentences.

from nnsplit import NNSplit
splitter = NNSplit.load("en")

splits = splitter.split([res])[0]

# The split result is iterable: print every sentence followed by a period.
last_index = len(splits) - 1
for index, sentence in enumerate(splits):
    print(sentence, end='')
    # Every sentence except the last emits a backspace before its period
    # (presumably to overwrite trailing whitespace — confirm); the last
    # sentence gets a plain period.
    print("\b." if index < last_index else '.')