Example #1
def transcript():

    if request.method == "POST":
        transcript_path = request.form.get('transcript_path')

        with open(transcript_path) as file:
            text = file.read()

        segmenter = DeepSegment('deepsegment_eng_v1/config.json')

        sentences = segmenter.segment(text)

        operational_problems = operational(sentences)

        length_chat_file = len(sentences)
        length_operational = len(operational_problems)
        length_non_operational = length_chat_file - length_operational

   
        # Call function to create a pie chart showing Operational vs Non-Operational Problems
        # draw_figure(length_operational, length_non_operational)

        return render_template("showgraph.html",
                               length_operational=length_operational,
                               length_non_operational=length_non_operational)

    else:
        return redirect(url_for("index"))
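The draw_figure call above is left commented out; a minimal sketch of what such a helper could look like, assuming matplotlib (the helper name comes from the comment, the body is hypothetical):

import matplotlib
matplotlib.use('Agg')  # render off-screen inside a web handler
import matplotlib.pyplot as plt

def draw_figure(length_operational, length_non_operational):
    # hypothetical pie chart of operational vs. non-operational problems
    plt.figure()
    plt.pie([length_operational, length_non_operational],
            labels=['Operational', 'Non-Operational'],
            autopct='%1.1f%%')
    plt.savefig('static/problems_pie.png')  # output path is illustrative
    plt.close()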
Example #2
def segment(data):
	segmenter = DeepSegment('en')
	seg_text = []
	for text in data:
		seg_text.extend(segmenter.segment_long(text))
	return seg_text
Example #3
def main():

    args = parse_args()

    # read the first line of the input transcript
    with open(args.input, mode='r') as input_file:
        line = input_file.readline()

    segmenter = DeepSegment('en')
    corrector = DeepCorrect(args.params_path, args.checkpoint_path)

    # segment the line into sentences, punctuate each, and write one per line
    with open(args.output, mode='w') as output_file:
        for part in segmenter.segment(line):
            corrected = corrector.correct(part)
            output_file.write(corrected[0]['sequence'] + '\n')
Example #4
def dataPreProcessModel():
    print("Inside dataPreProcessModel")
    global corrector
    corrector = DeepCorrect('model_params/deeppunct_params_en',
                            'model_params/deeppunct_checkpoint_google_news')
    global segmenter
    segmenter = DeepSegment('en')
Example #5
def dataPreProcessModel():
    print("Inside dataPreProcessModel")
    global corrector
    corrector = DeepCorrect(
        '/Users/Amitgarg/Documents/SJSU/272-Ranjan/Smart-MOM/model_params/deeppunct_params_en',
        '/Users/Amitgarg/Documents/SJSU/272-Ranjan/Smart-MOM/model_params/deeppunct_checkpoint_google_news'
    )
    global segmenter
    segmenter = DeepSegment('en')
Example #6
class DeepSegmenter(BaseSegmenter):
    """
    Designed with ASR outputs in mind, DeepSegment uses BiLSTM +
    CRF for automatic sentence boundary detection.
    It outperforms the standard libraries (spacy, nltk, corenlp ..)
    on imperfect text, and performs similarly for perfectly punctuated text.

    Example: 'I am Batman i live in gotham'
            ->  # ['I am Batman', 'i live in gotham']

    Details: https://github.com/notAI-tech/deepsegment

    :param lang_code: en - english (Trained on data from various sources);
        fr - french (Only Tatoeba data); it - italian (Only Tatoeba data)
    :type lang_code: str
    :param checkpoint_name: Name to be used as checkpoint
    :type checkpoint_name: str
    :param args:  Additional positional arguments
    :param kwargs: Additional keyword arguments
    """
    def __init__(self,
                 lang_code: str = 'en',
                 checkpoint_name: str = None,
                 *args,
                 **kwargs):
        """Set constructor."""
        super().__init__(*args, **kwargs)
        self.lang_code = lang_code
        self.checkpoint_name = checkpoint_name

    def post_init(self):
        from deepsegment import DeepSegment
        self._segmenter = DeepSegment(self.lang_code,
                                      checkpoint_name=self.checkpoint_name)

    @single
    def segment(self, text: str, *args, **kwargs) -> List[Dict]:
        """
        Split the text into sentences.

        :param text: Raw text to be segmented
        :type text: str
        :param args:  Additional positional arguments
        :param kwargs: Additional keyword arguments
        :return: List of sub-document dicts, one per segmented sentence
        :rtype: List[Dict]
        """

        results = []
        for idx, s in enumerate(self._segmenter.segment_long(text)):
            results.append(dict(text=s, offset=idx, weight=1.0))
        return results
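A minimal usage sketch for this executor (calling the method directly for illustration; the @single decorator and BaseSegmenter come from the jina framework):

seg = DeepSegmenter(lang_code='en')
seg.post_init()  # loads the underlying DeepSegment model
seg.segment('I am Batman i live in gotham')
# -> [{'text': 'I am Batman', 'offset': 0, 'weight': 1.0},
#     {'text': 'i live in gotham', 'offset': 1, 'weight': 1.0}]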
Example #7
 def predict(self, sample_text, word_length, segment, verbose):               #A text seed is provided
 
     '''Predicts the next text sequences'''
     #model = self.model    
     for _ in range(word_length):  # generate word_length additional words
         tokenList = self.tokenizer.texts_to_sequences([sample_text])[0]  # turn the seed into a token sequence
         tokenList = pad_sequences([tokenList], maxlen=self.maxSequenceLen - 1, padding=self.padding_method)
         predicted = self.model.predict_classes(tokenList, verbose=verbose)  # predict the next token (generated text)
         outputWord = " "
         for word, index in self.tokenizer.word_index.items():
             if index == predicted:
                 outputWord = word
                 break
         sample_text += " " + outputWord
         #Returns the seed plus generated text
     self.sample_text = sample_text
     if segment:
         segmenter = DeepSegment('en')
         result = segmenter.segment(self.sample_text)
         sample_text = result
     else:
         print(sample_text)
         sample_text = self.sample_text
     return sample_text
Example #8
def main():
    segmenter = DeepSegment('en')
    connect = pymysql.connect(**config)
    cursor = connect.cursor()
    cursor.execute('SELECT Id, Name, Caption FROM video')
    results = cursor.fetchall()
    new_results = []
    for result in tqdm(results):
        for playlist in playlist_list:
            if re.search(r'^' + playlist + r'_\d+', result['Name']):
                new_result = {}
                new_result['caption'] = ', '.join(
                    segmenter.segment_long(result['Caption']))
                new_result['name'] = result['Name']
                new_result['id'] = result['Id']
                new_results.append(new_result)
    new_results = rectify(new_results)
    for new_result in tqdm(new_results):
        cursor.execute('UPDATE video SET Caption=%s WHERE Id=%s',
                       (new_result['caption'], new_result['id']))
    print(cursor.execute('SELECT * FROM video'))
    connect.commit()
    cursor.close()
    connect.close()
Example #9
def processing(id):
    paragraph_object = Paragraph.objects.get(id=id)
    if not (hasattr(globals, 'corrector') and hasattr(globals, 'segmenter')):
        segmenter = DeepSegment('en')
        corrector = DeepCorrect('deep_punc/deeppunct_params_en',
                                'deep_punc/deeppunct_checkpoint_wikipedia')
        globals.corrector = corrector
        globals.segmenter = segmenter
    else:
        corrector = globals.corrector
        segmenter = globals.segmenter

    list_of_sentences = segmenter.segment(paragraph_object.original_text)
    paragraph = ''
    for i in range(len(list_of_sentences)):
        sentence = corrector.correct(list_of_sentences[i])
        if i == 0:
            paragraph += sentence[0]['sequence']
        else:
            paragraph += ' ' + sentence[0]['sequence']
    paragraph = paragraph.replace("\\", "")
    paragraph_object.processed_text = paragraph
    paragraph_object.processing = False
    paragraph_object.save()
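As the loop above assumes, DeepCorrect.correct returns a list of candidate dicts whose 'sequence' key holds the corrected text; the shape here is inferred from the indexing in this example:

result = corrector.correct('how are you doing')
print(result[0]['sequence'])  # the top-ranked punctuated/corrected sentence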
Example #10
File: split.py Project: xubiuit/jina
class DeepSegmenter(BaseSegmenter):
    """
    Designed with ASR outputs in mind, DeepSegment uses BiLSTM + CRF for automatic sentence boundary detection. It significantly outperforms the standard libraries (spacy, nltk, corenlp ..) on imperfect text and performs similarly for perfectly punctuated text.

    Example: 'I am Batman i live in gotham'
            ->  # ['I am Batman', 'i live in gotham']

    Details: https://github.com/notAI-tech/deepsegment
    """
    def __init__(self,
                 lang_code: str = 'en',
                 checkpoint_name: str = None,
                 *args,
                 **kwargs):
        """

        :param lang_code: en - english (Trained on data from various sources); fr - french (Only Tatoeba data); it - italian (Only Tatoeba data)
        :param args:
        :param kwargs:
        """
        super().__init__(*args, **kwargs)
        self.lang_code = lang_code
        self.checkpoint_name = checkpoint_name

    def post_init(self):
        from deepsegment import DeepSegment
        self._segmenter = DeepSegment(self.lang_code,
                                      checkpoint_name=self.checkpoint_name)

    def craft(self, text: str, doc_id: int, *args, **kwargs) -> List[Dict]:
        """
        Split the text into sentences.

        :param text: the raw text
        :param doc_id: the doc id
        :return: a list of chunk dicts, one per segmented sentence
        """

        results = []
        for idx, s in enumerate(self._segmenter.segment_long(text)):
            results.append(dict(text=s, offset=idx, weight=1.0))
        return results
Example #11
# https://github.com/bminixhofer/nnsplit
# =============================================================================
if False:
    from nnsplit import NNSplit

    splitter = NNSplit("de")

    res = splitter.split([data])

# =============================================================================
# More advanced: Deepsegment: Does not support German
# =============================================================================
if False:
    from deepsegment import DeepSegment
    # The default language is 'en'
    segmenter = DeepSegment('de')

    with open('data/start.txt', 'r') as myfile:
        data = myfile.read()

    segmenter.segment('I am Batman i live in gotham')

# =============================================================================
# Huggingface tokenizer
# =============================================================================

if False:
    from tokenizers.implementations import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing
    from pathlib import Path
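    # The block above is cut off after its imports; a plausible continuation,
    # following the usual ByteLevelBPETokenizer training recipe (paths and
    # hyperparameters here are illustrative, not from the original):
    paths = [str(x) for x in Path("data/").glob("*.txt")]

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=paths, vocab_size=52000, min_frequency=2,
                    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

    # wrap sentences in <s> ... </s> boundary tokens
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )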
Example #12
results = {}
index = 0
for test_data in test_datas:
    results[str(index)] = {}
    index = index + 1

index = 0
for test_data in test_datas:
    print(test_data)

    for data_set in test_datasets:
        results[str(index)][data_set] = {}

    for data_set in test_datasets:
        print(data_set)
        f_checkpoint = data_set + "/checkpoint"
        f_params = data_set + "/params"
        f_utils = data_set + "/utils"
        segmenter = DeepSegment(lang_code=None,
                                checkpoint_path=f_checkpoint,
                                params_path=f_params,
                                utils_path=f_utils,
                                tf_serving=False,
                                checkpoint_name=None)
        res = segmenter.segment_long(test_data, n_window=i_window)
        results[str(index)][data_set] = res

    index = index + 1

################################################################################################################################################
print(results)
Example #13
                continue
            else:
                yield from instances


# srl tagger
predictor = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/bert-base-srl-2019.06.17.tar.gz",
    cuda_device=0)

# segmentation model for splitting ill-formed utterances into well-formed sentences
logging.disable(logging.WARNING)
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(
    gpu_devices[0], True)  # do not take all GPU memory - tagger needs some too
segmentation = DeepSegment('en')

# utterances
utterances_path = config.Dirs.data / 'training' / f'{CORPUS_NAME}_mlm.txt'
params = Params.from_param2val(param2default)
utterances = load_utterances_from_file(utterances_path)

it = gen_instances()

progress_bar = pyprind.ProgBar(len(utterances) // BATCH_SIZE, stream=1)
num_no_verb = 0
num_only_verb = 0
lines = set()
outer_loop = True
while outer_loop:
Example #14
import json
#from ibm_watson import ToneAnalyzerV3

from watson_developer_cloud import ToneAnalyzerV3

import paralleldots as pd

from deepsegment import DeepSegment

from get_emotions import get_emotion_counts_with_vader

segmenter = DeepSegment('en')

pd.set_api_key( "yGZxjt2pV3Y3V0FizvQGCygybaLHGZRU0rvTNnSLlp8" )

tone_analyzer = ToneAnalyzerV3(
    version='2017-09-21',
    iam_apikey='0DWwlEM6RsPb0nnawbE3Rzbpmrg9OOLcLA5xJOel17wN',
    url='https://gateway-syd.watsonplatform.net/tone-analyzer/api'
)

"""
text = 'Team, I know that times are tough! Product '\
    'sales have been disappointing for the past three '\
    'quarters. We have a competitive product, but we '\
    'need to do a better job of selling it!'

"""

li_B= ['since 1990 the number of gun deaths', 'worldwide has reached six point five', 'million three quarters of gun deaths', 'occur in just 15 countries Latin America', 'is home to some of the worlds most', 'violent countries by murder rate El', 'Salvador Venezuela and Guatemala are the', 'top three countries for deaths caused by', 'guns per population these Latin American', 'countries are marred by corruption', 'organized crime and a dysfunctional', 'criminal justice system that further', 'fuels the problem the availability of', 'guns in the United States is another', 'concern for these countries an estimated', '200,000 guns a year that were first sold', 'in the United States are smuggled over', 'the southern border and used in violent', 'crimes in Latin America and the', 'Caribbean in the United States the', 'constitutional right to bear arms has', 'led to looser regulations and easier', 'access to firearms this contributes to', 'the 30,000 men women and children who', 'were killed with guns each year mass', 'shootings attract their headlines but in', 'fact these make up only 0.2% of gun', 'deaths 60% of gun related deaths are in', 'fact suicide', "America's suicide rate increased by 25", 'percent between 1999 and 2015 of nearly', '45,000 taking their own lives in 2015', 'alone half of these suicides were', "carried out with guns though guns aren't", 'the most common method of suicide they', 'are the most lethal other wealthy', 'countries have far lower rates of gun', 'violence in Japan if you want to own a', 'gun you must pass a written exam and a', 'shooting range test alongside a series', 'of mental health drug in criminal record', 'tests', 'it has virtually eradicated gun crime', 'after a mass shooting in 1996 Australia', 'introduced an effective buyback scheme', 'of firearms in the 20 years following', 'the bag there was an accelerated decline', 'in total gun deaths but in America the', 'House of Representatives has not voted', 'on a single measure to prevent gun', 'violence and in some states such as', 'Texas where students at public colleges', 'can now carry concealed handguns the law', 'has actually loosened easy access to', 'firearms will continue to be the main', 'driver of Americas gun debt']
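li_B holds unpunctuated ASR-style caption fragments; given the segmenter created above, the natural next step is to rejoin and re-segment them into sentences before tone analysis (a sketch; how the result feeds get_emotion_counts_with_vader is assumed, not shown in the original):

full_text = ' '.join(li_B)
sentences = segmenter.segment_long(full_text)
print(sentences[:3])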
Example #15
import os
import urllib.request

import fasttext
import spacy
from bs4 import BeautifulSoup
from deepsegment import DeepSegment

# load things:
VALID_CHARS = set("abcdefghijklmnopqrstuvwxyz123456789. ")
nlp = spacy.load("en_core_web_md")
merge_ncs = nlp.create_pipe("merge_noun_chunks")
merge_ents = nlp.create_pipe("merge_entities")
nlp.add_pipe(merge_ents)
nlp.add_pipe(merge_ncs)

model = fasttext.load_model(
    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                 "model_1000000.ftz"))
segmenter = DeepSegment("en")

########################
# Function definitions #
########################


def preprocess_text(text):
    """ simplify the text before fasttext processing. """
    return "".join(c for c in text.lower() if c in VALID_CHARS)


#####################
# Class definitions #
#####################
Example #16
class Decoder:
    def __init__(self, name: str, bit_rate: int, iteration: int = 1, max_active: int = 10000,
                 max_batch_size=50) -> None:
        super().__init__()
        self.name = name
        self.bit_rate = bit_rate
        self.segmenter = None
        self.use_feedback = False

        self.env = os.environ.copy()
        self.env["ITERATIONS"] = str(iteration)
        self.env["MAX_ACTIVE"] = str(max_active)
        self.env["MAX_BATCH_SIZE"] = str(max_batch_size)

        self.model_dir = os.path.join(
            "/workspace/nvidia-examples/", name.lower())
        self.result_dir = os.path.join("/tmp/results/", name.lower())
        self.prep_command = "prepare_data.sh"
        self.batch_feedback_command = "run_benchmark.sh"
        self.batch_command = "run_benchmark_org.sh"

        self.last_run = None
        # Decoding lock
        self.batch_lk = Lock()

        self.model_trainings = 0

    def initialize(self) -> None:
        prep_process = Popen(["/bin/bash", self.prep_command],
                             stdin=PIPE, stderr=PIPE, cwd=self.model_dir)
        stdout, stderr = prep_process.communicate()
        logging.debug(stdout)
        logging.debug(stderr)

    def init_segment(self):
        from deepsegment import DeepSegment
        self.segmenter = DeepSegment("en", tf_serving=False)

    def extract_corpora(self, batch_id: int, iter_id: int = 0) -> Dict[str, object]:
        # noinspection PyBroadException
        try:
            trans_file = open(os.path.join(
                self.result_dir, str(batch_id), str(iter_id), "trans"))
            transcripts = trans_file.readlines()
            transcript_repo: Dict[str, List[str]] = {}
            for t in transcripts:
                spl = t.split(maxsplit=1)
                header = spl[0].split("_")[-1].split(".")[0]
                trans = spl[1]
                single_trans = open(os.path.join(self.result_dir, str(
                    batch_id), str(iter_id), "trans_" + header), "w")
                single_trans.write(trans)
                single_trans.close()

                archived = open(os.path.join("/root/audio/batch" +
                                             str(batch_id), "tran_" + header + ".txt"), "w")
                archived.write(trans)
                archived.close()

                transcript_repo[header] = trans.split()

            trans_int_file = open(os.path.join(self.result_dir, str(
                batch_id), str(iter_id), "trans_int_combined"))
            transcript_ints = trans_int_file.readlines()
            transcript_int_repo: Dict[str, List[int]] = {}
            for t in transcript_ints:
                spl = t.split(maxsplit=1)
                header = spl[0].split("_")[-1].split(".")[0]
                trans = spl[1]
                single_trans = open(os.path.join(self.result_dir, str(
                    batch_id), str(iter_id), "trans_int_combined_" + header), "w")
                single_trans.write(trans)
                single_trans.close()
                transcript_int_repo[header] = list(
                    map(lambda x: int(x), trans.split()))

            cmt_file = open(os.path.join(self.result_dir, str(
                batch_id), str(iter_id), "CTM.ctm"))
            convo = cmt_file.readlines()
            # noinspection PyTypeChecker
            extraction: Dict[str, TextIOWrapper] = {}
            convo_repo: Dict[str, List] = {}
            for c in convo:
                conv = c.split()[3:]
                conv = [float(conv[0]), int(conv[1])]

                meta = c.split()[0]
                header = meta.split(".", maxsplit=1)[0].split("_")[-1]
                if header in extraction:
                    extraction[header].write(str(conv[0]) + " " + str(conv[1]))
                    convo_repo[header].append(conv)
                else:
                    fd = os.path.join(self.result_dir, str(
                        batch_id), str(iter_id), header + ".ctm")
                    # noinspection PyTypeChecker
                    extraction[header] = open(fd, "w")
                    convo_repo[header] = []
            for k in extraction.keys():
                extraction[k].close()
        except Exception:
            logging.error("Failed for batch %s", batch_id)
            return {}

        batch_out = {}
        for key in transcript_repo.keys():
            # noinspection PyBroadException
            # try:
            transcript_tokens = transcript_repo[key]
            transcript = ""
            for tt in transcript_tokens:
                transcript += (tt + " ")
            alignment, duration = Decoder.calculate_alignment(
                transcript_repo[key], transcript_int_repo[key], convo_repo[key])

            logging.debug("Alignment complete for ", key)
            sentences = []

            # noinspection PyBroadException
            try:
                os.environ['CUDA_VISIBLE_DEVICES'] = '0'

                self.init_segment()
                sentences = self.segmenter.segment_long(transcript)

                use_lstm = True
            except Exception as e:
                logging.error(e)
                use_lstm = False
                tokens = transcript.split()
                for index in range(len(tokens)):
                    sentence_size = len(sentences)
                    if sentence_size < (int(index / 5) + 1):
                        sentences.append([])
                    word = tokens[int(index)]
                    sentences[int(index / 5)].append(word)

            w_dim = 0
            aligned_sentences = list()
            for s_raw in sentences:
                if use_lstm:
                    sentence = s_raw.split()
                else:
                    sentence = s_raw
                    sentence[-1] = sentence[-1] + "."

                aligned_sentence = list()
                for widx, word in enumerate(sentence):
                    word_obj = Word(word, alignment[w_dim])
                    w_dim += 1  # advance through the flat alignment list, one entry per word
                    if widx == len(sentence) - 1:
                        word_obj.add_tag("is_punctuated", True)
                    aligned_sentence.append(word_obj)

                sentence_obj = Sentence(aligned_sentence, 0)
                aligned_sentences.append(sentence_obj)

            # set each sentence's end time once, after all sentences are built:
            # a sentence ends where the next one begins; the last ends at the full duration
            for idx, _ in enumerate(aligned_sentences):
                if idx < (len(aligned_sentences) - 1):
                    aligned_sentences[idx].length = aligned_sentences[idx + 1].words[0].timestamp
            aligned_sentences[len(aligned_sentences) - 1].length = duration

            transcript_out = {"duration": duration, "length": len(alignment), "sentences": aligned_sentences,
                              "complete": "1"}
            out_json = open(os.path.join(
                "/root/audio/batch" + str(batch_id), key + ".json"), "w")
            json.dump(transcript_out, out_json)
            out_json.close()
            batch_out[key] = transcript_out

        # Release GPU
        device = cuda.get_current_device()
        device.reset()
        return batch_out

    def decode_batch(self, batch_id: int, iter_id: int = 0) -> Dict[str, object]:
        self.batch_lk.acquire(blocking=True)
        # set environment, start new shell
        batch_env = self.env
        batch_env["DATASET"] = os.path.join(
            "/root/audio/batch" + str(batch_id))

        # if self.use_feedback:
        prep_process = Popen(["/bin/bash", self.batch_feedback_command],
                             stdin=PIPE, stderr=PIPE, env=batch_env, cwd=self.model_dir)
        stdout, stderr = prep_process.communicate()
        logging.debug(stdout)
        logging.debug(stderr)

        num_words = len(open(os.path.join("/tmp/results", self.name,
                                          str(batch_id), "0", "trans")).readlines()[0].split())
        # Fallback to original model if retrained model doesn't decode
        if num_words < 2:
            self.use_feedback = False
            shutil.rmtree(os.path.join(
                "/tmp/results", self.name, str(batch_id)))
            prep_process = Popen(["/bin/bash", self.batch_command],
                                 stdin=PIPE, stderr=PIPE, env=batch_env, cwd=self.model_dir)
            stdout, stderr = prep_process.communicate()
            logging.debug(stdout)
            logging.debug(stderr)

        # batch_env = self.env
        prep_process = Popen(["/usr/bin/gzip", "-d", os.path.join(
            self.result_dir, str(batch_id), str(iter_id), "lat_aligned.gz")], stdin=PIPE)
        stdout, stderr = prep_process.communicate()
        logging.debug(stdout)
        logging.debug(stderr)

        ctm_file = os.path.join(self.result_dir, str(
            batch_id), str(iter_id), "CTM.ctm")

        lattice_align_command: str = ""
        lattice_align_command += "/opt/kaldi/src/latbin/lattice-align-words-lexicon --partial-word-label=4324 " \
                                 "/workspace/models/aspire/data/lang_chain/phones/align_lexicon.int " \
                                 "/workspace/models/aspire/final.mdl"
        lattice_align_command += (" ark:" + os.path.join(self.result_dir,
                                                         str(batch_id), str(iter_id), "lat_aligned"))
        lattice_align_command += " ark:- | /opt/kaldi/src/latbin/lattice-1best ark:- ark:- | " \
                                 "/opt/kaldi/src/latbin/nbest-to-ctm ark:- "
        lattice_align_command += ctm_file
        prep_process = Popen(lattice_align_command, stdin=PIPE, shell=True)
        stdout, stderr = prep_process.communicate()
        logging.debug(stdout)
        logging.debug(stderr)

        corpora = self.extract_corpora(batch_id)

        if self.last_run is not None:
            self.last_run += 1
        else:
            self.last_run = 0

        self.batch_lk.release()
        return corpora

    def clear_results(self) -> None:
        if os.path.exists(self.result_dir):
            shutil.rmtree(self.result_dir)  # os.rmdir would fail on a non-empty results dir

    def train_model(self, fb: FeedbackAgent) -> None:
        fb.iter = self.model_trainings
        fb.lk = self.batch_lk
        fb.start()
        self.model_trainings += 1

    @staticmethod
    def calculate_alignment(words: List[str], idx: List[int], lats: List[List]) -> Tuple[List, float]:
        word_table: Dict[int, str] = dict()
        alignment = []

        len_words = len(words)
        len_idx = len(idx)
        len_lats = len(lats)
        assert len_words == len_idx
        if len_idx > len_lats:
            lats.insert(0, [0.0, idx[0]])
            if lats[0][0] == lats[1][0]:
                lats[1][0] = (lats[2][0] / 2)

        for i in range(len_idx):
            wt_idx = idx[i]
            word_table[wt_idx] = words[i]
        offset = 0.0
        for i in range(len_lats):
            original_lats = lats[i][0]
            if original_lats == 0.0:
                # noinspection PyBroadException
                try:
                    next_lat = lats[i + 1][0]
                    original_lats = next_lat / 2
                except:
                    original_lats = 0.05
            offset += original_lats
            lat_i = int(lats[i][1])
            w = word_table[lat_i]
            align = offset - lats[i][0]
            alignment.append([w, align])
        return alignment, offset

    @staticmethod
    def fetch_transcript(batch_id: int, corpus_id: str) -> object:
        out_json = open(os.path.join("/root/audio/batch" +
                                     str(batch_id), corpus_id + ".json"), "r")
        out_json = json.load(out_json)
        return out_json
Example #17
from deepsegment import DeepSegment

m = DeepSegment()


def predictor(x, batch_size=32):
    return m.segment(x, batch_size=batch_size)
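A usage sketch (assuming, as the batch_size parameter suggests, that segment accepts a list of texts and returns one list of sentences per input):

print(predictor(['I am Batman i live in gotham']))
# -> [['I am Batman', 'i live in gotham']]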
Example #18
#pip install deepsegment

from deepsegment import DeepSegment

# declaring segmenter object
segmenter = DeepSegment()

# applying sentence segmentation
segmenter.segment('I am Batman, I live in Gotham')
>>> ['I am Batman, I live in Gotham']

# performs well even without punctuation
segmenter.segment('I am Batman i liv in gotham')
>>> ['I am Batman', 'i liv in gotham']
Example #19
 def post_init(self):
     from deepsegment import DeepSegment
     self._segmenter = DeepSegment(self.lang_code,
                                   checkpoint_name=self.checkpoint_name)
Example #20
 def segmentsent(self, text):
     segmenter = DeepSegment('en')
     result = segmenter.segment(text)
     return result
Example #21
def make_caption(video_dir, caption_dir, split_video_dir, frame_dir,
                 trash_dir_path, timecode_dir, video, pyscenedetect_threshold,
                 punct, classify_model, mode, tmp_annotation_dir):
    tmp_annotation_path = os.path.join(tmp_annotation_dir,
                                       video + '_annotation.json')
    if not os.path.exists(tmp_annotation_path):
        cv2.setNumThreads(1)
        print(f'{video} has been started.')

        video_name = video + ".mp4"
        video_path = os.path.join(video_dir, video_name)
        caption_path = os.path.join(caption_dir, (video + ".en.vtt"))
        video_elements_dir_path = os.path.join(split_video_dir, video)
        timecode_path = os.path.join(timecode_dir, video + ".pkl")
        trash_dir_path = os.path.join(trash_dir_path, video)
        if not os.path.exists(trash_dir_path):
            os.makedirs(trash_dir_path)

        if os.path.exists(timecode_path):
            print(f'[splitting]: {video} has been started. (loading...)')
            timecode_dict = load_pickle(timecode_path)
            print(
                f'[splitting]: {video} has been done. (timecode_list has been loaded.)'
            )

        else:
            print(f'[splitting]: {video} has been started.')
            timecode_list = split_video(video_path, video_name,
                                        video_elements_dir_path,
                                        pyscenedetect_threshold)
            video_elements = sorted(os.listdir(video_elements_dir_path))
            try:
                assert len(timecode_list) == len(
                    video_elements
                ), f'video:{video} timecode_list:{len(timecode_list)} video_elements:{len(video_elements)}'
            except AssertionError as err:
                print('AssertionError:', err)
            print(f'[splitting]: {video} has been done.')

            timecode_dict = {}
            for i, video_element in enumerate(video_elements):
                video_element_path = os.path.join(video_elements_dir_path,
                                                  video_element)
                is_useful = classify(
                    video_elements_dir_path, video_element,
                    os.path.join(frame_dir, video_name, video_element),
                    classify_model)

                if is_useful:
                    timecode_dict[video_element[:-4]] = timecode_list[i]
                else:
                    shutil.move(video_element_path, trash_dir_path)

            save_pickle(timecode_dict, timecode_path)

        if punct == 'deepsegment':
            segmenter = DeepSegment('en')
        elif punct == 'fastpunct':
            segmenter = FastPunct('en')
        else:
            raise Exception(
                'You have probably chosen something other than fastpunct and deepsegment.'
            )

        annotation_dict = {}

        for i, useful_element in enumerate(timecode_dict.keys()):
            useful_element_path = os.path.join(video_elements_dir_path,
                                               useful_element + '.mp4')
            capture = cv2.VideoCapture(useful_element_path)
            fps = capture.get(cv2.CAP_PROP_FPS)
            frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            duration = frame_count / fps
            annotation_data = make_caption_data(useful_element, caption_path,
                                                timecode_dict[useful_element],
                                                duration, fps, punct, mode,
                                                segmenter)

            if len(annotation_data) == 0:
                shutil.move(useful_element_path, trash_dir_path)
                # note: the element is not removed from the timecode dict
                # raise Exception(f'Caption data is None: {useful_element}')
            else:
                # print(f'[caption]: {useful_element} has been done.')
                annotation_dict.update(annotation_data)

        with open(tmp_annotation_path, 'w') as f:
            json.dump(annotation_dict, f)

    else:
        print(f'[caption]: {video} annotation already exists.')

    print(f'[split/caption]: {video} has been done.')
Example #22
from deepsegment import DeepSegment
segmenter = DeepSegment(
    checkpoint_path=
    '/Users/trinhgiang/PycharmProjects/deepsegment-2/vi/checkpoint',
    params_path='/Users/trinhgiang/PycharmProjects/deepsegment-2/vi/params',
    utils_path='/Users/trinhgiang/PycharmProjects/deepsegment-2/vi/utils')
# print(segmenter.segment('Tư vấn cho khách hàng về chữ ký số và các phần mềm bảo hiểm xã hội điện tử, hóa đơn điện tử, chữ ký số ..vv.'))
sent = 'Nhân viên kinh doanh và chăm sóc khách hàng 10/2015 - 04/2016 - Tư vấn cho khách hàng về chữ ký số và các phần mềm bảo hiểm xã hội điện tử, hóa đơn điện tử, chữ ký số ..vv.. -Hướng dẫn khách hàng về thủ tục hồ sơ để đăng ký chữ ký số và các phần mềm của công ty - Giải đáp thắc mắc của khách hàng và chuyển yêu cầu của khách hàng cho bộ phận liên quan'
print(segmenter.segment_long(sent))
Example #23
import string

from deepsegment import DeepSegment

from util import *
from predefined import *

################################  Constants #######################################
N_WINDOW = 7
MODEL_PATH = "trained/altyazilar_not_456_senteces"
CHECKPOINT_PATH = MODEL_PATH + "/checkpoint"
PARAM_PATH = MODEL_PATH + "/params"
UTILS_PATH = MODEL_PATH + "/utils"
PREDEFINED_ENABLED = True

################################ Init DeepSegmenter ################################
segmenter = DeepSegment(lang_code=None,
                        checkpoint_path=CHECKPOINT_PATH,
                        params_path=PARAM_PATH,
                        utils_path=UTILS_PATH,
                        tf_serving=False,
                        checkpoint_name=None)


################################ Init predefined ################################
def normalize_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ' '.join(text.strip().split())
    text = lower_tr(text)
    return text


PREDEFINED_LENGTH = len(PREDEFINED)
PREDEFINED.sort(key=len, reverse=True)
PREDEFINED_NORMALIZED = [normalize_text(x) for x in PREDEFINED]
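lower_tr is pulled in via the star import from util and is not shown; a plausible Turkish-aware sketch of it (hypothetical, not the project's actual code):

def lower_tr(text):
    # Turkish dotted/dotless i: 'I' lowercases to 'ı' and 'İ' to 'i'
    return text.replace('I', 'ı').replace('İ', 'i').lower()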
Example #24
def fn(test):
    
    from deepsegment import DeepSegment
    segmenter=DeepSegment('en')
    import textrazor
    textrazor.api_key = "043e170ef41a6d297a508581225bd493943f3a9f831345fb71f86d64"

    client = textrazor.TextRazor(extractors=["words", "relations"])
    #client.set_do_cleanup_HTML(True)

    response = client.analyze(test)
    l=[]

    for property in response.properties():
        for word in property.predicate_words:
            l.append(word.lemma)
            if word.lemma == "sound":
                for property_word in property.property_words:
                    for phrase in property_word.noun_phrases:
                        print (phrase)
                break
    l=[]
    flag=False
    for sentence in response.sentences():
        print(sentence.words)
        for word in sentence.words:
            if word.lemma=="image" or word.lemma=="picture" or word.lemma=="photo" or word.lemma=="show" or word.lemma=="see" or word.lemma=="display":
                k=word.lemma
                flag=True 
            l.append(word.lemma)
    astring=""
    for i in l:
        astring+=i+" "

    f=open("keyword.txt",'a')
    f.write(astring+"\n")
    f.close()
    alist=segmenter.segment(astring)
    print(alist)

    if(flag):
        s=l.index(k)
        m=l[s:]
    
        t=""
        st=""
        for i in m:
            t+=i+" "
    else:
        t="No image found"
        st=""
        for j in l:
            st+=j+" "

    text1=st
    text2=t

    print(t)
    response1=client.analyze(t)

    for noun in response1.noun_phrases():
        print(noun.words)
        for word in noun.words:
            print(word.lemma)


    from requests import exceptions
    import argparse
    import requests
    import cv2
    import os
    import time

    starttime = time.time()
    

    # set your Microsoft Cognitive Services API key along with (1) the
    # maximum number of results for a given search and (2) the group size
    # for results (maximum of 50 per request)
    API_KEY = "948886a19a794c428c53fcfa2aa0325b"
    MAX_RESULTS = 1
    GROUP_SIZE = 1
    
    # set the endpoint API URL
    URL = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"

    # when attempting to download images from the web both the Python
    # programming language and the requests library have a number of
    # exceptions that can be thrown so let's build a list of them now
    # so we can filter on them
    EXCEPTIONS = set([IOError, FileNotFoundError,
        exceptions.RequestException, exceptions.HTTPError,
        exceptions.ConnectionError, exceptions.Timeout])


    # store the search term in a convenience variable then set the
    # headers and search parameters
    term = t
    headers = {"Ocp-Apim-Subscription-Key" : API_KEY}
    params = {"q": term, "offset": 0, "count": GROUP_SIZE}
    
    # make the search
    print("[INFO] searching Bing API for '{}'".format(term))
    search = requests.get(URL, headers=headers, params=params)
    search.raise_for_status()
    
    # grab the results from the search, including the total number of
    # estimated results returned by the Bing API
    results = search.json()
    estNumResults = min(results["totalEstimatedMatches"], MAX_RESULTS)
    print("[INFO] {} total results for '{}'".format(estNumResults,
        term))
    
    # initialize the total number of images downloaded thus far
    total = 0


    for offset in range(0, estNumResults, GROUP_SIZE):
        # update the search parameters using the current offset, then
        # make the request to fetch the results
        print("[INFO] making request for group {}-{} of {}...".format(
            offset, offset + GROUP_SIZE, estNumResults))
        params["offset"] = offset
        search = requests.get(URL, headers=headers, params=params)
        search.raise_for_status()
        results = search.json()
        print("[INFO] saving images for group {}-{} of {}...".format(
            offset, offset + GROUP_SIZE, estNumResults))
        # loop over the results
        for v in results["value"]:
            # try to download the image
            try:
                # make a request to download the image
                print("[INFO] fetching: {}".format(v["contentUrl"]))
                r = requests.get(v["contentUrl"], timeout=30)
    
                # build the path to the output image
                ext = v["contentUrl"][v["contentUrl"].rfind("."):]
                p = os.path.sep.join([r"C:\Users\HP\Desktop\Projects\VIT Hack\SlideEZ-test",
                                      "{}{}".format(str(total).zfill(8), ext)])

                print("The answer is")
                print(p)
    
                # write the image to disk
                f = open(p, "wb")
                f.write(r.content)
                f.close()
    
            # catch any errors that would prevent us from downloading the image
            except Exception as e:
                # check to see if our exception is in our list of
                # exceptions to check for
                if type(e) in EXCEPTIONS:
                    print("[INFO] skipping: {}".format(v["contentUrl"]))
                    continue
            # try to load the image from disk
            image = cv2.imread(p)

            # if the image is `None` then we could not properly load the
            # image from disk (so it should be ignored)
            if image is None:
                print("[INFO] deleting: {}".format(p))
                os.remove(p)
                continue

            # update the counter
            total += 1
    endtime=time.time()-starttime
    print("Total time taken to search for the query is")
    print(endtime)

    from pptx import Presentation
    from pptx.util import Inches, Pt 
    from pptx.enum.text import PP_ALIGN
    from PIL import Image
    from pptx.dml.color import RGBColor
    from pptx.enum.dml import MSO_THEME_COLOR

    presentation = "testppt3.pptx"
    prs = Presentation(presentation)
    if len(prs.slides)==0:
        title_slide_layout = prs.slide_layouts[0]
        slide = prs.slides.add_slide(title_slide_layout)
        background=slide.background
        fill=background.fill
        fill.gradient()
        fill.gradient_angle=40
        gradient_stops=fill.gradient_stops
        gradient_stop=gradient_stops[0]
        color=gradient_stop.color
        color.theme_color=MSO_THEME_COLOR.LIGHT_1
        title = slide.shapes.title
        subtitle = slide.placeholders[1]
        title.text = "Test"
        subtitle.text = "test"
        prs.save(presentation)
    if not flag:

        text_slide_layout = prs.slide_layouts[1]
        slide = prs.slides.add_slide(text_slide_layout)
        background=slide.background
        fill=background.fill
        fill.gradient()
        fill.gradient_angle=40
        gradient_stops=fill.gradient_stops
        gradient_stop=gradient_stops[0]
        color=gradient_stop.color
        color.theme_color=MSO_THEME_COLOR.LIGHT_1
        title = slide.shapes.title
        blist=[]
        for i in range(0,len(alist)):
            blist+=alist[i].split(" ")

        mx=0
        slide_t=""
        for j in blist:
            if(len(j)>=mx):
                mx=len(j)
                slide_t=j.title()
            
        title.text= slide_t
        content = slide.shapes.placeholders[1]
        tf = content.text_frame
        for i in alist:
            para=tf.add_paragraph()
            para.text=i
            para.level=1
        prs.save(presentation)
    else:

        image_slide_layout = prs.slide_layouts[8]
        slide = prs.slides.add_slide(image_slide_layout)
        background=slide.background
        fill=background.fill
        fill.gradient()
        fill.gradient_angle=40
        gradient_stops=fill.gradient_stops
        gradient_stop=gradient_stops[0]
        color=gradient_stop.color
        color.theme_color=MSO_THEME_COLOR.LIGHT_1
        #title = slide.shapes.title
        #title.text="Sub2"
        content = slide.shapes.placeholders[1]
        im=Image.open(p)
        width,height= im.size
        content.height= height
        content.width= width
        content.insert_picture(p)
        content = slide.shapes.placeholders[0]
        tf = content.text_frame
        for i in alist:
            
            para=tf.add_paragraph()
            para.text=i
            para.level=1
            para.alignment=PP_ALIGN.CENTER
        #left = Inches(6)
        #top = Inches(3)
        #height = Inches(2)
        #pic = slide.shapes.add_picture(p, left, top, height=height)
        prs.save(presentation)
Example #25
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 13 11:26:00 2020

@author: barth
"""

import pandas as pd
import re
from deepsegment import DeepSegment
from tqdm import tqdm

updating = True

segmenter = DeepSegment('en')

df = pd.read_pickle('JREdataframeUPDATED.pkl')


def getPodNum(vidtitle):
    num = vidtitle.split('#')[1].split()[0]
    num = num.replace('-', '')

    return int(num)


vidnums = [getPodNum(i) for i in df['Title']]

df['PodNum'] = vidnums

df = df.sort_values(['PodNum', 'Title']).reset_index(drop=True)
Example #26
File: backup_SBD.py Project: blesk011/BTS
    def window_segment(self, strings):
        segmenter = DeepSegment('en')
        sentences = segmenter.segment_long(strings)

        return sentences
Example #27
 def init_segment(self):
     from deepsegment import DeepSegment
     self.segmenter = DeepSegment("en", tf_serving=False)
Example #28
def get_segmenter():
    segmenter = DeepSegment('en')
    return segmenter
Example #29
import paralleldots as pd
from ibm_watson import ToneAnalyzerV3

from deepsegment import DeepSegment

from ibm_watson import NaturalLanguageUnderstandingV1

from ibm_watson.natural_language_understanding_v1 \
    import Features, EntitiesOptions, KeywordsOptions

natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2019-07-12',
    iam_apikey='oRW-WiI73HQMQxq0mVZnPJzN3UFwX4-9oD-XpjLjqUNi',
    url=
    'https://gateway-wdc.watsonplatform.net/natural-language-understanding/api'
)

segmenter = DeepSegment('en')

pd.set_api_key("Mf5Rgw0kBSWSNThFQxKYbEQvPgKgrexUKqPEPDMwGkM")

tone_analyzer = ToneAnalyzerV3(
    version='2017-09-21',
    iam_apikey='E8dobLcUUvh7NZU6MzpFv-GDUiIuEmOV43vQIWSNO0tE',
    url='https://gateway-wdc.watsonplatform.net/tone-analyzer/api')

import sys
import nltk
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.stem.snowball import SnowballStemmer

from nltk.stem import WordNetLemmatizer
Example #30
from deepsegment import DeepSegment
# The default language is 'en'
segmenter = DeepSegment('en')
print(segmenter.segment('I am Batman i live in gotham'))
print(segmenter.segment_long('I am Batman i live in gotham'))
# ['I am Batman', 'i live in gotham']
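For longer inputs, segment_long segments the text with a sliding window and stitches the results; the window size is tunable via n_window, as Example #12 shows (the text below is illustrative):

long_text = ' '.join(['I am Batman i live in gotham'] * 20)
print(segmenter.segment_long(long_text, n_window=10))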