Example No. 1
from deepsegment import DeepSegment


def segment(data):
    segmenter = DeepSegment('en')
    seg_text = []
    for text in data:
        # segment_long handles long, possibly unpunctuated text; call it once
        # per input and collect the resulting sentences
        seg_text.extend(segmenter.segment_long(text))
    return seg_text
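A quick usage sketch for this helper; the input strings are hypothetical and the output shown is illustrative:

docs = ['I am Batman i live in gotham',
        'my name is slim shady i am the real shady']
print(segment(docs))
# e.g. ['I am Batman', 'i live in gotham', 'my name is slim shady', ...]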
Example No. 2
from typing import Dict, List, Optional

# BaseSegmenter and the @single decorator come from the surrounding framework
# (this snippet is written as a Jina-style segmenter executor).


class DeepSegmenter(BaseSegmenter):
    """
    Designed with ASR outputs in mind, DeepSegment uses a BiLSTM +
    CRF for automatic sentence boundary detection.
    It outperforms standard libraries (spaCy, NLTK, CoreNLP, ...) on
    imperfect text, and performs similarly on perfectly punctuated text.

    Example: 'I am Batman i live in gotham'
            ->  # ['I am Batman', 'i live in gotham']

    Details: https://github.com/notAI-tech/deepsegment

    :param lang_code: en - English (trained on data from various sources);
        fr - French (only Tatoeba data); it - Italian (only Tatoeba data)
    :type lang_code: str
    :param checkpoint_name: Name to be used as checkpoint
    :type checkpoint_name: str
    :param args:  Additional positional arguments
    :param kwargs: Additional keyword arguments
    """
    def __init__(self,
                 lang_code: str = 'en',
                 checkpoint_name: Optional[str] = None,
                 *args,
                 **kwargs):
        """Set constructor."""
        super().__init__(*args, **kwargs)
        self.lang_code = lang_code
        self.checkpoint_name = checkpoint_name

    def post_init(self):
        from deepsegment import DeepSegment
        self._segmenter = DeepSegment(self.lang_code,
                                      checkpoint_name=self.checkpoint_name)

    @single
    def segment(self, text: str, *args, **kwargs) -> List[Dict]:
        """
        Split the text into sentences.

        :param text: Raw text to be segmented
        :type text: str
        :param args:  Additional positional arguments
        :param kwargs: Additional keyword arguments
        :return: List of sub-document dicts, one per segmented sentence
        :rtype: List[Dict]
        """

        results = []
        for idx, s in enumerate(self._segmenter.segment_long(text)):
            results.append(dict(text=s, offset=idx, weight=1.0))
        return results
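For reference, the dict construction inside segment can be exercised directly with the underlying library (output values are illustrative):

from deepsegment import DeepSegment

segmenter = DeepSegment('en')
results = [dict(text=s, offset=idx, weight=1.0)
           for idx, s in enumerate(segmenter.segment_long('I am Batman i live in gotham'))]
print(results)
# e.g. [{'text': 'I am Batman', 'offset': 0, 'weight': 1.0},
#       {'text': 'i live in gotham', 'offset': 1, 'weight': 1.0}]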
Example No. 3
from typing import Dict, List, Optional

# As above, BaseSegmenter comes from the surrounding framework.


class DeepSegmenter(BaseSegmenter):
    """
    Designed with ASR outputs in mind, DeepSegment uses a BiLSTM + CRF for automatic sentence boundary detection. It significantly outperforms standard libraries (spaCy, NLTK, CoreNLP, ...) on imperfect text and performs similarly on perfectly punctuated text.

    Example: 'I am Batman i live in gotham'
            ->  # ['I am Batman', 'i live in gotham']

    Details: https://github.com/notAI-tech/deepsegment
    """
    def __init__(self,
                 lang_code: str = 'en',
                 checkpoint_name: Optional[str] = None,
                 *args,
                 **kwargs):
        """

        :param lang_code: en - English (trained on data from various sources); fr - French (only Tatoeba data); it - Italian (only Tatoeba data)
        :param checkpoint_name: name to be used as checkpoint
        :param args: additional positional arguments
        :param kwargs: additional keyword arguments
        """
        super().__init__(*args, **kwargs)
        self.lang_code = lang_code
        self.checkpoint_name = checkpoint_name

    def post_init(self):
        from deepsegment import DeepSegment
        self._segmenter = DeepSegment(self.lang_code,
                                      checkpoint_name=self.checkpoint_name)

    def craft(self, text: str, doc_id: int, *args, **kwargs) -> List[Dict]:
        """
        Split the text into sentences.

        :param text: the raw text
        :param doc_id: the doc id
        :return: a list of chunk dicts, one per segmented sentence
        """

        results = []
        for idx, s in enumerate(self._segmenter.segment_long(text)):
            results.append(dict(text=s, offset=idx, weight=1.0))
        return results
Example No. 4
import re

import pymysql
from deepsegment import DeepSegment
from tqdm import tqdm

# config (assumed to request a DictCursor, so rows come back as dicts),
# playlist_list and rectify are defined elsewhere in this script.


def main():
    segmenter = DeepSegment('en')
    connect = pymysql.connect(**config)
    cursor = connect.cursor()
    cursor.execute('SELECT Id, Name, Caption FROM video')
    results = cursor.fetchall()
    new_results = []
    for result in tqdm(results):
        for playlist in playlist_list:
            if re.search(r'^' + playlist + r'_\d+', result['Name']):
                new_result = {}
                new_result['caption'] = ', '.join(
                    segmenter.segment_long(result['Caption']))
                new_result['name'] = result['Name']
                new_result['id'] = result['Id']
                new_results.append(new_result)
    new_results = rectify(new_results)
    for new_result in tqdm(new_results):
        cursor.execute('UPDATE video SET Caption=%s WHERE Id=%s',
                       (new_result['caption'], new_result['id']))
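    # pymysql's execute() returns the number of matched rows; printing it here
    # acts as a sanity check that the table is still readable.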
    print(cursor.execute('SELECT * FROM video'))
    connect.commit()
    cursor.close()
    connect.close()
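In isolation, the caption rewriting step above boils down to the following (the caption text is hypothetical):

from deepsegment import DeepSegment

segmenter = DeepSegment('en')
caption = 'the cat sits on the mat it looks happy'  # hypothetical unpunctuated caption
print(', '.join(segmenter.segment_long(caption)))
# e.g. 'the cat sits on the mat, it looks happy'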
Example No. 5
import json
import logging
import os
import shutil
from io import TextIOWrapper
from subprocess import PIPE, Popen
from threading import Lock
from typing import Dict, List, Tuple

from numba import cuda

# Word, Sentence and FeedbackAgent are project-local classes from the
# surrounding codebase.


class Decoder:
    def __init__(self, name: str, bit_rate: int, iteration: int = 1, max_active: int = 10000,
                 max_batch_size=50) -> None:
        super().__init__()
        self.name = name
        self.bit_rate = bit_rate
        self.segmenter = None
        self.use_feedback = False

        self.env = os.environ.copy()
        self.env["ITERATIONS"] = str(iteration)
        self.env["MAX_ACTIVE"] = str(max_active)
        self.env["MAX_BATCH_SIZE"] = str(max_batch_size)

        self.model_dir = os.path.join(
            "/workspace/nvidia-examples/", name.lower())
        self.result_dir = os.path.join("/tmp/results/", name.lower())
        self.prep_command = "prepare_data.sh"
        self.batch_feedback_command = "run_benchmark.sh"
        self.batch_command = "run_benchmark_org.sh"

        self.last_run = None
        # Decoding lock
        self.batch_lk = Lock()

        self.model_trainings = 0

    def initialize(self) -> None:
        prep_process = Popen(["/bin/bash", self.prep_command],
                             stdin=PIPE, stderr=PIPE, cwd=self.model_dir)
        stdout, stderr = prep_process.communicate()
        logging.debug(stdout)
        logging.debug(stderr)

    def init_segment(self):
        from deepsegment import DeepSegment
        self.segmenter = DeepSegment("en", tf_serving=False)

    def extract_corpora(self, batch_id: int, iter_id: int = 0) -> Dict[str, object]:
        # noinspection PyBroadException
        try:
            trans_file = open(os.path.join(
                self.result_dir, str(batch_id), str(iter_id), "trans"))
            transcripts = trans_file.readlines()
            transcript_repo: Dict[str, List[str]] = {}
            for t in transcripts:
                spl = t.split(maxsplit=1)
                header = spl[0].split("_")[-1].split(".")[0]
                trans = spl[1]
                single_trans = open(os.path.join(self.result_dir, str(
                    batch_id), str(iter_id), "trans_" + header), "w")
                single_trans.write(trans)
                single_trans.close()

                archived = open(os.path.join("/root/audio/batch" +
                                             str(batch_id), "tran_" + header + ".txt"), "w")
                archived.write(trans)
                archived.close()

                transcript_repo[header] = trans.split()

            trans_int_file = open(os.path.join(self.result_dir, str(
                batch_id), str(iter_id), "trans_int_combined"))
            transcript_ints = trans_int_file.readlines()
            transcript_int_repo: Dict[str, List[int]] = {}
            for t in transcript_ints:
                spl = t.split(maxsplit=1)
                header = spl[0].split("_")[-1].split(".")[0]
                trans = spl[1]
                single_trans = open(os.path.join(self.result_dir, str(
                    batch_id), str(iter_id), "trans_int_combined_" + header), "w")
                single_trans.write(trans)
                single_trans.close()
                transcript_int_repo[header] = [int(x) for x in trans.split()]

            cmt_file = open(os.path.join(self.result_dir, str(
                batch_id), str(iter_id), "CTM.ctm"))
            convo = cmt_file.readlines()
            # noinspection PyTypeChecker
            extraction: Dict[str, TextIOWrapper] = {}
            convo_repo: Dict[str, List] = {}
            for c in convo:
                conv = c.split()[3:]
                conv = [float(conv[0]), int(conv[1])]

                meta = c.split()[0]
                header = meta.split(".", maxsplit=1)[0].split("_")[-1]
                if header not in extraction:
                    fd = os.path.join(self.result_dir, str(
                        batch_id), str(iter_id), header + ".ctm")
                    # noinspection PyTypeChecker
                    extraction[header] = open(fd, "w")
                    convo_repo[header] = []
                # Record every entry, including the first one seen for a header
                extraction[header].write(str(conv[0]) + " " + str(conv[1]) + "\n")
                convo_repo[header].append(conv)
            for k in extraction.keys():
                extraction[k].close()
        except Exception:
            logging.error("Failed for batch %s", batch_id)
            return {}

        batch_out = {}
        for key in transcript_repo.keys():
            transcript = " ".join(transcript_repo[key])
            alignment, duration = Decoder.calculate_alignment(
                transcript_repo[key], transcript_int_repo[key], convo_repo[key])

            logging.debug("Alignment complete for ", key)
            sentences = []

            # noinspection PyBroadException
            try:
                os.environ['CUDA_VISIBLE_DEVICES'] = '0'

                self.init_segment()
                sentences = self.segmenter.segment_long(transcript)

                use_lstm = True
            except Exception as e:
                logging.error(e)
                use_lstm = False
                tokens = transcript.split()
                for index in range(len(tokens)):
                    sentence_size = len(sentences)
                    if sentence_size < (int(index / 5) + 1):
                        sentences.append([])
                    word = tokens[int(index)]
                    sentences[int(index / 5)].append(word)

            w_dim = 0
            aligned_sentences = list()
            for s_raw in sentences:
                if use_lstm:
                    sentence = s_raw.split()
                else:
                    sentence = s_raw
                    sentence[-1] = sentence[-1] + "."

                aligned_sentence = list()
                for widx, word in enumerate(sentence):
                    word_obj = Word(word, alignment[w_dim])
                    if widx == len(sentence) - 1:
                        word_obj.add_tag("is_punctuated", True)
                    aligned_sentence.append(word_obj)
                    w_dim += 1  # step through the flat word-level alignment

                sentence_obj = Sentence(aligned_sentence, 0)
                aligned_sentences.append(sentence_obj)

            # Each sentence's length runs up to the start of the next one;
            # the final sentence runs to the end of the audio.
            for idx in range(len(aligned_sentences) - 1):
                aligned_sentences[idx].length = aligned_sentences[idx + 1].words[0].timestamp
            aligned_sentences[-1].length = duration

            transcript_out = {"duration": duration, "length": len(alignment), "sentences": aligned_sentences,
                              "complete": "1"}
            out_json = open(os.path.join(
                "/root/audio/batch" + str(batch_id), key + ".json"), "w")
            json.dump(transcript_out, out_json)
            out_json.close()
            batch_out[key] = transcript_out

        # Release GPU
        device = cuda.get_current_device()
        device.reset()
        return batch_out

    def decode_batch(self, batch_id: int, iter_id: int = 0) -> Dict[str, object]:
        self.batch_lk.acquire(blocking=True)
        # set environment, start new shell
        batch_env = self.env
        batch_env["DATASET"] = os.path.join(
            "/root/audio/batch" + str(batch_id))

        # if self.use_feedback:
        prep_process = Popen(["/bin/bash", self.batch_feedback_command],
                             stdin=PIPE, stderr=PIPE, env=batch_env, cwd=self.model_dir)
        stdout, stderr = prep_process.communicate()
        logging.debug(stdout)
        logging.debug(stderr)

        num_words = len(open(os.path.join("/tmp/results", self.name,
                                          str(batch_id), "0", "trans")).readlines()[0].split())
        # Fallback to original model if retrained model doesn't decode
        if num_words < 2:
            self.use_feedback = False
            shutil.rmtree(os.path.join(
                "/tmp/results", self.name, str(batch_id)))
            prep_process = Popen(["/bin/bash", self.batch_command],
                                 stdin=PIPE, stderr=PIPE, env=batch_env, cwd=self.model_dir)
            stdout, stderr = prep_process.communicate()
            logging.debug(stdout)
            logging.debug(stderr)

        # batch_env = self.env
        prep_process = Popen(["/usr/bin/gzip", "-d", os.path.join(
            self.result_dir, str(batch_id), str(iter_id), "lat_aligned.gz")], stdin=PIPE)
        stdout, stderr = prep_process.communicate()
        logging.debug(stdout)
        logging.debug(stderr)

        ctm_file = os.path.join(self.result_dir, str(
            batch_id), str(iter_id), "CTM.ctm")

        lattice_align_command: str = ""
        lattice_align_command += "/opt/kaldi/src/latbin/lattice-align-words-lexicon --partial-word-label=4324 " \
                                 "/workspace/models/aspire/data/lang_chain/phones/align_lexicon.int " \
                                 "/workspace/models/aspire/final.mdl"
        lattice_align_command += (" ark:" + os.path.join(self.result_dir,
                                                         str(batch_id), str(iter_id), "lat_aligned"))
        lattice_align_command += " ark:- | /opt/kaldi/src/latbin/lattice-1best ark:- ark:- | " \
                                 "/opt/kaldi/src/latbin/nbest-to-ctm ark:- "
        lattice_align_command += ctm_file
        prep_process = Popen(lattice_align_command, stdin=PIPE, shell=True)
        stdout, stderr = prep_process.communicate()
        logging.debug(stdout)
        logging.debug(stderr)

        corpora = self.extract_corpora(batch_id)

        if self.last_run is not None:
            self.last_run += 1
        else:
            self.last_run = 0

        self.batch_lk.release()
        return corpora

    def clear_results(self) -> None:
        if os.path.exists(self.result_dir):
            shutil.rmtree(self.result_dir)  # os.rmdir only removes empty directories

    def train_model(self, fb: FeedbackAgent) -> None:
        fb.iter = self.model_trainings
        fb.lk = self.batch_lk
        fb.start()
        self.model_trainings += 1

    @staticmethod
    def calculate_alignment(words: List[str], idx: List[int], lats: List[List]) -> Tuple[List, float]:
        word_table: Dict[int, str] = dict()
        alignment = []

        len_words = len(words)
        len_idx = len(idx)
        len_lats = len(lats)
        assert len_words == len_idx
        if len_idx > len_lats:
            lats.insert(0, [0.0, idx[0]])
            if lats[0][0] == lats[1][0]:
                lats[1][0] = (lats[2][0] / 2)

        for i in range(len_idx):
            wt_idx = idx[i]
            word_table[wt_idx] = words[i]
        offset = 0.0
        for i in range(len_lats):
            original_lats = lats[i][0]
            if original_lats == 0.0:
                try:
                    # A zero start time is estimated as half of the next one
                    next_lat = lats[i + 1][0]
                    original_lats = next_lat / 2
                except IndexError:
                    original_lats = 0.05
            offset += original_lats
            lat_i = int(lats[i][1])
            w = word_table[lat_i]
            align = offset - lats[i][0]
            alignment.append([w, align])
        return alignment, offset

    @staticmethod
    def fetch_transcript(batch_id: int, corpus_id: str) -> object:
        with open(os.path.join("/root/audio/batch" +
                               str(batch_id), corpus_id + ".json"), "r") as out_json:
            return json.load(out_json)
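A rough driving sketch inferred from the method signatures above (the model name, bit rate, and corpus id are assumptions, and the container layout the class expects must already exist):

decoder = Decoder(name="aspire", bit_rate=8000)   # hypothetical model name / bit rate
decoder.initialize()                              # runs prepare_data.sh in the model dir
corpora = decoder.decode_batch(0)                 # decodes audio under /root/audio/batch0
transcript = Decoder.fetch_transcript(0, "utt0")  # "utt0" is a hypothetical corpus id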
Example No. 6
from deepsegment import DeepSegment
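# Load a custom model (here a Vietnamese one) from explicit checkpoint, params
# and utils paths instead of a built-in lang_code.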
segmenter = DeepSegment(
    checkpoint_path=
    '/Users/trinhgiang/PycharmProjects/deepsegment-2/vi/checkpoint',
    params_path='/Users/trinhgiang/PycharmProjects/deepsegment-2/vi/params',
    utils_path='/Users/trinhgiang/PycharmProjects/deepsegment-2/vi/utils')
# print(segmenter.segment('Tư vấn cho khách hàng về chữ ký số và các phần mềm bảo hiểm xã hội điện tử, hóa đơn điện tử, chữ ký số ..vv.'))
sent = 'Nhân viên kinh doanh và chăm sóc khách hàng 10/2015 - 04/2016 - Tư vấn cho khách hàng về chữ ký số và các phần mềm bảo hiểm xã hội điện tử, hóa đơn điện tử, chữ ký số ..vv.. -Hướng dẫn khách hàng về thủ tục hồ sơ để đăng ký chữ ký số và các phần mềm của công ty - Giải đáp thắc mắc của khách hàng và chuyển yêu cầu của khách hàng cho bộ phận liên quan'
print(segmenter.segment_long(sent))
Example No. 7
    def window_segment(self, strings):
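        # Note: this reloads the model on every call; in practice the
        # DeepSegment instance is usually created once and cached.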
        segmenter = DeepSegment('en')
        sentences = segmenter.segment_long(strings)

        return sentences
Example No. 8
from deepsegment import DeepSegment
# The default language is 'en'
segmenter = DeepSegment('en')
print(segmenter.segment('I am Batman i live in gotham'))
print(segmenter.segment_long('I am Batman i live in gotham'))
# Both print: ['I am Batman', 'i live in gotham']
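segment_long also accepts an n_window parameter (the size of the moving window the model is applied over, per the deepsegment README), which Example No. 9 below sweeps; a short sketch:

from deepsegment import DeepSegment

segmenter = DeepSegment('en')
# segment_long stitches predictions from a moving window over the text,
# so it scales to long ASR transcripts.
long_text = ('I am Batman i live in gotham ' * 20).strip()
print(segmenter.segment_long(long_text, n_window=10))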
Example No. 9
from deepsegment import DeepSegment

# test_datas, test_datasets and i_window are assumed to be defined earlier in
# the script; results collects one entry per (test input, model) pair.
results = {}

for index, test_data in enumerate(test_datas):
    print(test_data)
    results[str(index)] = {}

    for data_set in test_datasets:
        print(data_set)
        # Each data_set directory holds a trained model: checkpoint, params, utils
        segmenter = DeepSegment(lang_code=None,
                                checkpoint_path=data_set + "/checkpoint",
                                params_path=data_set + "/params",
                                utils_path=data_set + "/utils",
                                tf_serving=False,
                                checkpoint_name=None)
        results[str(index)][data_set] = segmenter.segment_long(test_data,
                                                               n_window=i_window)

print(results)