Example No. 1
    def fragments_by_delta(self, from_fragment):
        if len(from_fragment) < self.settings.MIN_FRAGMENT_LEN:
            raise IndexError(
                "Can't process fragment: MIN_FRAGMENT_LEN is greater than the fragment length"
            )

        fragments_found = []
        windows = self.extract_windows(from_fragment)

        fragment_start = None
        for i, window in enumerate(windows):
            # Delta: the change in mean level relative to the previous window
            window.delta = window.mean - windows[i - 1].mean if i > 0 else window.mean

            above_delta = window.delta > self.settings.DELTA_THRESHOLD
            if above_delta and fragment_start is None:
                fragment_start = window.start_frame - (self.settings.WINDOW_SIZE // 2)
            elif (above_delta and fragment_start is not None
                  and window.mean > self.settings.SILENCE_THRESHOLD):
                if window.start_frame - fragment_start >= self.settings.MIN_FRAGMENT_LEN:
                    fragments_found.append(
                        Fragment(from_fragment, fragment_start, window.start_frame))
                    fragment_start = window.start_frame - (self.settings.WINDOW_SIZE // 2)

        if fragment_start is not None:
            fragments_found.append(
                Fragment(from_fragment, fragment_start,
                         from_fragment.window.end_frame))

        return fragments_found, windows
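
For intuition, the per-window delta is just the successive difference of window means (the first window keeps its own mean as its delta). A minimal sketch with made-up numbers, independent of the class above:

means = [10, 300, 320, 40]  # hypothetical window means
deltas = [means[0]] + [b - a for a, b in zip(means, means[1:])]
print(deltas)  # [10, 290, 20, -280]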
Example No. 2
    def _get_data(self, label_identifier=None):
        """Original label_identifier is 'semeval2020', translate to 'semeval2021'
        If label_identifier is set, we translate here in _get_data, so that happens
        only when parsing the original dataset
        """
        self.labels = read_labels_from_file(label_identifier)

        # File containing labelled data
        label_file = os.path.join(self.task_config['dir'], self.task_config['label_file'])
        with open(label_file, 'r') as f:
            lst = f.readlines()

        self.data = []
        prev_article_id = -1
        fragments = []
        for line in lst:
            article_id, p_type, start, end = line.strip().split('\t')

            article_id = int(article_id)
            if article_id == prev_article_id:
                # FIXME: hardcoded exceptions for label_identifiers
                if label_identifier != 'semeval2020':
                    for n_p_type in normalize_label(p_type):
                        fragments.append(Fragment(int(start), int(end), n_p_type))
                else:
                    fragments.append(Fragment(int(start), int(end), p_type))
            else:
                if prev_article_id != -1:
                    # Add the previous article
                    #print("APPENDING article ", prev_article_id)
                    fragments.sort()  # Sort on first tuple element, that is, 'start'
                    self.data.append({
                        'id': prev_article_id,
                        'article': self.read_article(prev_article_id),
                        'fragments': fragments
                    })

                # Prepare the new one
                prev_article_id = article_id
                fragments = []
                if label_identifier != 'semeval2020':
                    for n_p_type in normalize_label(p_type):
                        fragments.append(Fragment(int(start), int(end), n_p_type))
                else:
                    fragments.append(Fragment(int(start), int(end), p_type))

        if fragments:
            #print("Appending ", prev_article_id)
            self.data.append({
                'id': prev_article_id,
                'article': self.read_article(prev_article_id),
                'fragments': fragments
            })
        print("Total fragments = ", sum([len(d['fragments']) for d in self.data]))
        return self.labels, self.data
Example No. 3
def test_split_sentences_multi_until():
    id = '111'
    start_text, start_fragments, continuation_text, rest_fragments, new_start = \
        split_sentences_multi_until(article_2, fragments_2, start_index=0)

    assert start_text == 'aaaaa\n\nbbbbb\n'
    assert start_fragments == [Fragment(2, 6, 'foo'), Fragment(4, 8, 'bar')]
    assert continuation_text == 'ccccc\n\n01234'
    assert rest_fragments == [Fragment(8, 10, 'foo')]
    assert new_start == 13
    assert article_2[new_start] == 'c'
    assert article_2[new_start - 1] == '\n'
Example No. 4
def chopchop():
    for songfile in utils.onlyfiles(RAW_MP3_DIR):
        song_path_in = os.path.join(RAW_MP3_DIR, songfile)
        song = AudioSegment.from_mp3(song_path_in)

        for i, c in enumerate(utils.chunks(song, segment_duration)):
            if len(c) < segment_duration:
                continue

            print(f"Processing {songfile} chunk {i}")
            fragment = Fragment(f"{songfile[:-4]}",
                                i,
                                mp3_seg = c
            )

            fragment.mp3_to_np()
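
utils.chunks is not shown here; a plausible sketch of such a helper, assuming pydub AudioSegment semantics (len() and slicing are both in milliseconds):

def chunks(segment, size_ms):
    # Yield consecutive slices of size_ms milliseconds; the last one may be shorter
    for offset in range(0, len(segment), size_ms):
        yield segment[offset:offset + size_ms]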
Example No. 5
def decode(s: str, all_labels: list) -> tuple:
    """Convert a string with Begin and End markers into an article and a list of fragments."""
    article = []  # article is built up character by character
    label_dict = {label_index: None for label_index in range(len(all_labels))}
    re_start = "^\[i-(\d+)\]\s?"
    re_end = "^\s?\[o-(\d+)\]"
    index = 0
    fragments = []
    errors = 0  # number of errors during processing
    while index < len(s):
        m = re.match(re_start, s[index:])
        n = re.match(re_end, s[index:])
        if m is not None:
            # Set start marker for the label to the current length of the article
            label_index = int(m.group(1))
            if label_dict[label_index] is not None:
                print(
                    f"Ignoring unclosed open marker to open new for '{all_labels[label_index]}'/{label_index} in\n(({s}))"
                )
                errors += 1
            label_dict[label_index] = len(article)
            index += len(m.group(0))
        elif n is not None:
            label_index = int(n.group(1))
            cur_start = label_dict[label_index]
            if cur_start is None:
                # We found an end marker for which there is no start marker
                print(
                    f"Ignoring end marker without start marker for '{all_labels[label_index]}'/{label_index} in\n"
                    f"(({s}))")
                errors += 1
            else:
                cur_end = len(article)
                while cur_start < cur_end and article[cur_start] in (' ', '\n', '\t'):
                    cur_start += 1
                if cur_end > cur_start:
                    f = Fragment(cur_start, cur_end, all_labels[label_index])
                    fragments.append(f)
                label_dict[label_index] = None
            index += len(n.group(0))
        else:
            # Regular character. We can just add, but then we have to do bookkeeping
            article.append(s[index])
            index += 1

    # Check if there are loose ends, that is, open markers without closure
    for l_index in range(len(all_labels)):
        if label_dict[l_index] is not None:
            print(
                f"Ignoring unclosed marker for label '{all_labels[l_index]}'/{l_index} in\n(({s}))"
            )
            errors += 1

    return ''.join(article), fragments, errors
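
A minimal round-trip sketch of the marker format decode() parses; the label list and the input string here are made up for illustration:

labels = ['Loaded Language']
text, frags, errs = decode('plain [i-0]loaded words[o-0] here', labels)
# text  == 'plain loaded words here'
# frags == [Fragment(6, 18, 'Loaded Language')], errs == 0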
Example No. 6
    def __init__(self, settings):
        self.settings = settings
        self.src_fragment = Fragment(self.settings.AUDIO_FILE)

        # Convert the ms settings to frame counts, given the sample frequency
        frames_per_ms = self.src_fragment.src_freq / 1000
        self.settings.WINDOW_SIZE = int(self.settings.WINDOW_SIZE * frames_per_ms)
        self.settings.MIN_SILENCE_LEN = int(self.settings.MIN_SILENCE_LEN * frames_per_ms)
        self.settings.MIN_FRAGMENT_LEN = int(self.settings.MIN_FRAGMENT_LEN * frames_per_ms)
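
For intuition: with a 44100 Hz source (the rate is only an assumed example), a 200 ms window becomes 8820 frames:

print(int(200 * (44100 / 1000)))  # 8820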
Example No. 7
def loadJSON2021(path: str):
    """
    :param path: Comma separated list of paths to JSON files
    :return:
    """
    data = []
    file_list = [x.strip() for x in path.split(',')]
    for file_name in file_list:
        with open(file_name, 'r', encoding='utf8') as f:
            json_data = json.load(f)

        for item in json_data:
            id = item['id']
            txt = item['text']
            fragments = []

            if 'image' not in item:
                # Variation 1: labels =
                labels = item['labels']  # labels = [{start:, end:, technique:, text_fragment}, ...]
                # We skip that 'text_fragment' as it can be derived from text and start/end
                for frag in labels:
                    if frag['text_fragment'].strip() == '':
                        # Deal with a '\n' fragment by skipping it
                        continue
                    fragments.append(Fragment(frag['start'], frag['end'], frag['technique']))
            else:
                # Variation 2: each listed label applies to the whole text
                start = 0
                end = len(txt)
                for label in item['labels']:
                    fragments.append(Fragment(start, end, label))
                # There is an image, but we ignore it
                image = item['image']

            fragments.sort()
            data.append({'id': id, 'article': txt, 'fragments': fragments})
    return data
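
Hypothetical usage (the file names are placeholders): loadJSON2021 takes one comma-separated string of paths and returns a single merged list of articles:

data = loadJSON2021('train_set.json,dev_set.json')
print(len(data), 'articles,', sum(len(d['fragments']) for d in data), 'fragments')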
Example No. 8
def test_calibrate():
    match_text = 'hello sad big world!'
    orig_text = 'goodbye sad big world!'
    fragment = Fragment(6, 13, 'foo')

    assert fragment.extract(match_text) == 'sad big'
    res = calibrate(fragment, match_text, orig_text, distance=3)
    print('new fragment = ', res)
    assert res == Fragment(8, 15, 'foo')

    match_text = ' hello sad big world!'
    orig_text = 'hello sad big world!'
    fragment = Fragment(1, 6, 'foo')

    assert fragment.extract(match_text) == 'hello'
    res = calibrate(fragment, match_text, orig_text, distance=3)
    print('new fragment = ', res)
    assert res == Fragment(0, 5, 'foo')
Example No. 9
    def fragments_by_silence(self, from_fragment, SILENCE_THRESHOLD=None):
        if len(from_fragment) < self.settings.MIN_SILENCE_LEN:
            raise IndexError(
                "Can't process fragment: MIN_SILENCE_LEN is greater than the fragment length"
            )

        if SILENCE_THRESHOLD is None:
            SILENCE_THRESHOLD = self.settings.SILENCE_THRESHOLD

        silences = []
        windows = self.extract_windows(from_fragment, WINDOW_SIZE=1000)
        silence_start = None
        for window in windows:
            if window.mean <= SILENCE_THRESHOLD:
                if silence_start is None:
                    silence_start = window.start_frame
            elif silence_start is not None:
                if (window.start_frame - silence_start) >= self.settings.MIN_SILENCE_LEN:
                    silences.append(Window(silence_start, window.start_frame))

                silence_start = None

        # for i in range(len(from_fragment)):
        #     i_offset = i + from_fragment.window.start_frame
        #     if abs(from_fragment.src_audio_signal[i_offset]) <= SILENCE_THRESHOLD:
        #         if silence_start is None:
        #             silence_start = i_offset
        #     elif silence_start is not None:
        #         if (i_offset - silence_start) >= self.settings.MIN_SILENCE_LEN:
        #             silences.append(Window(silence_start, i_offset))

        #         silence_start = None

        # Each fragment starts 3/4 of the way into the previous silence
        # and ends 1/4 of the way into the current one
        fragments_found = [
            Fragment(from_fragment,
                     silences[i - 1].end_frame - (len(silences[i - 1]) // 4),
                     silences[i].start_frame + (len(silences[i]) // 4))
            for i in range(1, len(silences))
        ]
        return fragments_found, silences
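
For intuition, with assumed numbers (taking len(window) to be end_frame - start_frame): a silence spanning frames 1000-2000 followed by one spanning 5000-5400 yields a fragment from 1750 to 5100:

prev_end, prev_len = 2000, 1000   # silence from frame 1000 to 2000
curr_start, curr_len = 5000, 400  # silence from frame 5000 to 5400
print(prev_end - prev_len // 4, curr_start + curr_len // 4)  # 1750 5100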
Example No. 10
    def _get_data(self, label_identifier="semeval2021"):
        self.labels = read_labels_from_file(label_identifier)
        self.data = []
        training_set_file = os.path.join(self.task_config['dir'], self.task_config['label_file'])
        file_list = [x.strip() for x in training_set_file.split(',')]
        for file_name in file_list:
            with open(file_name, 'r', encoding='utf8') as f:
                json_data = json.load(f)

            for item in json_data:
                id = item['id']
                txt = item['text']
                labels = item['labels']  # labels = [{start:, end:, technique:, text_fragment}, ...]
                # We skip that 'text_fragment' as it can be derived from text and start/end
                fragments = []
                for frag in labels:
                    if frag['text_fragment'].strip() == '':
                        # Deal with a '\n' fragment by skipping it
                        continue
                    fragments.append(Fragment(frag['start'], frag['end'], frag['technique']))
                fragments.sort()
                self.data.append({'id': id, 'article': txt, 'fragments': fragments})
        return self.labels, self.data
Example No. 11
    def load_all_fragment(self):
        file_names_of_plates = os.listdir(const_values.FLAGS.dir_of_plates)
        file_names_of_fractures = os.listdir(const_values.FLAGS.dir_of_fractures)
        names_of_fragment = []

        for name_of_plate in file_names_of_plates:
            if name_of_plate.endswith(self.prefix):
                file_names_of_fragments = os.listdir(const_values.FLAGS.dir_of_plates + name_of_plate)
                for name_of_fragment in file_names_of_fragments:
                    if name_of_fragment.endswith('.stl'):
                        names_of_fragment.append(name_of_plate + '/' + name_of_fragment)
        # print(names_of_fragment)
        names_of_fracture = [[] for _ in range(len(names_of_fragment))]
        for name_of_fracture in file_names_of_fractures:
            # Fracture files are named '<prefix>-<index>-...'; group them per fragment
            s = name_of_fracture.split('-')
            if s[0] == self.prefix:
                names_of_fracture[int(s[1]) - 1].append(name_of_fracture)
        # print(names_of_fracture)
        for i in range(len(names_of_fragment)):
            # print('fragment: ', const_values.FLAGS.dir_of_plates + names_of_fragment[i])
            # print('fracture: ', const_values.FLAGS.dir_of_fractures + names_of_fracture[i][0])
            fragment = self.read_stl(const_values.FLAGS.dir_of_plates + names_of_fragment[i])
            fractures = [self.read_stl(const_values.FLAGS.dir_of_fractures + j) for j in names_of_fracture[i]]
            self.fragments.append(Fragment(fragment, fractures, self.prefix))
Example No. 12
def test_split_sentences_multi():
    id = '111'
    part = get_fragment(article_2, fragments_2[2])
    assert part == '12'

    data = split_sentences_multi(
        id, article_2, fragments_2,
        include_empty=True)  #, start_index=0, data=[])

    print(data)

    assert data[0] == {
        'id': '111x',
        'article': 'aaaaa\n\nbbbbb\n',
        'fragments': [Fragment(2, 6, 'foo'),
                      Fragment(4, 8, 'bar')]
    }

    assert data[1] == {
        'id': '111_0p0',
        'article': 'ccccc\n\n',
        'fragments': []
    }

    #assert get_fragment(data[2]['article'], data[2]['fragments'][0]) == '12'
    assert data[2] == {
        'id': '111_0x',
        'article': '01234',
        'fragments': [Fragment(1, 3, 'foo')]
    }

    # Add a fragment that starts after the previous fragments,
    # but starts before the end of the sentence included with the previous fragments.
    fragments_2.append(Fragment(9, 10, 'tolstoy'))
    fragments_2.sort()

    data = split_sentences_multi(id,
                                 article_2,
                                 fragments_2,
                                 include_empty=True)
    assert data[0] == {
        'id': '111x',
        'article': 'aaaaa\n\nbbbbb\n',
        'fragments': [
            Fragment(2, 6, 'foo'),
            Fragment(4, 8, 'bar'),
            Fragment(9, 10, 'tolstoy')
        ]
    }

    assert data[1] == {
        'id': '111_0p0',
        'article': 'ccccc\n\n',
        'fragments': []
    }

    assert get_fragment(data[2]['article'], data[2]['fragments'][0]) == '12'
    assert data[2] == {
        'id': '111_0x',
        'article': '01234',
        'fragments': [Fragment(1, 3, 'foo')]
    }
Example No. 13
import sys
import pprint
from fragment import Fragment
        
# graph from start to stop
# convert fragments from string to Counter

fragments = ['*'] + ['*A', '*AB', '*ABC', '*ABCD', '*CCDDD', '*ABCDE', '*AABEE',
                     '*ACCDDD', '*AABCDE', '*AABCEE', '*ACCCDDD', '*AABCCDE',
                     '*AABBCEE', '*AAAADDE', '*AABBCCDE']
fragments = [Fragment(fragment) for fragment in fragments]
#fragments = [Fragment(fragment) for index, fragment in enumerate(fragments) if index % 3 in {0, 1}]
    
# Create graph

graph = dict()
for fragment in fragments:
    graph[fragment] = set()

for fragment1 in fragments:
    for fragment2 in fragments:
        # make an edge if fragment1 is a proper subset of fragment2
        if fragment1 < fragment2:
            # determine the difference between both fragments
            diff = fragment2.difference(fragment1)

            if Fragment('*') <= diff:
                diff = diff.difference(Fragment('*'))

#             if Fragment('o') <= diff:
Example No. 14
def calibrate(fragment: Fragment,
              match_text: str,
              orig_text: str,
              distance=3) -> Fragment:
    """Calibrates a fragment to another text, assuming it to be nearly right.

    Effectively, it moves a fragment a short distance to better match the original text"""
    fragment_text = fragment.extract(match_text).lower()

    print(f'INPUT FRAGMENT = [{fragment_text}]')

    first_word_re = re.compile(r'\A\W*(\w+)\W.*', re.MULTILINE | re.DOTALL)
    # Include trailing end-of-line punctuation
    last_word_re = re.compile(r'.*\W(\w+[!.?]?)\W*\Z', re.MULTILINE | re.DOTALL)

    m = first_word_re.match(fragment_text + ' ')
    assert m is not None, f"First word matching failed for [{fragment_text} ] for {fragment}"
    first_word = m.group(1)

    # Deal with aberrant single-letter words: skip them unless they are 'i' or 'a'
    # (note: this block was at one point disabled because it had a negative effect)
    if len(first_word) == 1 and first_word not in ['i', 'a']:
        fragment.start += 1
        fragment_text = fragment_text[1:]
        m = first_word_re.match(fragment_text + ' ')
        assert m is not None, f"First word matching failed (b) for [{fragment_text} ] for {fragment}"
        first_word = m.group(1)

    n = last_word_re.match(' ' + fragment_text)
    assert n is not None, f"Last word matching failed for [ {fragment_text}] for {fragment}"
    last_word = n.group(1)

    have_set_first_word = False
    have_set_last_word = False

    start, end = fragment.start, fragment.end
    startpos = max(start - distance, 0)
    for i in range(startpos, start + distance):
        if orig_text[i:].lower().startswith(first_word):
            start = i
            have_set_first_word = True
            break

    endpos = min(end + distance, len(orig_text))
    for i in range(end - distance, endpos):
        if orig_text[:i].lower().endswith(last_word):
            end = i
            have_set_last_word = True
            break

    if not have_set_first_word:
        res = surrounding_word(orig_text, start)
        if res is None:
            print("starting in empty space")
        else:
            start = res[0]
    if not have_set_last_word:
        res = surrounding_word(orig_text, end, with_line_end=True)
        if res is None:
            print("ending in empty space")
        else:
            end = res[1]

    fragment.start = start
    fragment.end = end

    print(f'OUTPUT FRAGMENT = [{fragment.extract(orig_text)}]')

    return fragment
Example No. 15
def compile(infix):
    """Return NFA fragment of the infix expression
    :param infix: regular expression
    :type infix: string
    :return: Fragment representing the NFA
    """

    # Convert infix to postfix
    postfix = shunt(infix)
    # Make postfix a stack
    postfix = list(postfix)[::-1]

    # Stack to keep track of fragments
    nfa_stack = []

    while postfix:
        cChar = postfix.pop()

        if cChar == '.':
            # Concatenation
            # Pop two Fragments
            frag1, frag2 = nfa_stack.pop(), nfa_stack.pop()

            # Point frag2 accept state at frag1 start state
            frag2.accept.edges.append(frag1.start)

            start, accept = frag2.start, frag1.accept

        elif cChar == '|':
            # Alternation
            # Pop two Fragments
            frag1, frag2 = nfa_stack.pop(), nfa_stack.pop()

            # Create new start and accept states
            accept, start = State(), State(edges=[frag1.start, frag2.start])

            # Point old accept state to new one
            frag2.accept.edges.append(accept)
            frag1.accept.edges.append(accept)

        elif cChar == '?':
            # Zero or One
            '''
            One: accepts one character after the '?'
            Zero: no matches
            accept state is an arrow that points to nothing, so both accept.
            Similar to kleene star, just doesn't point back to itself after
            accepting a character.
            '''
            # Pop one fragment
            frag = nfa_stack.pop()

            # Create new start and accept states
            # (accept must exist before it is referenced in the start state's edges)
            accept = State()
            start = State(edges=[frag.start, accept])

            # Point old accept state to new accept state
            frag.accept.edges.append(accept)

        elif cChar == '+':
            # One or more
            '''
            Accepts if there is one character and if more are read in, 
            points back to itself (frag.start)
            '''
            # Pop one fragment
            frag = nfa_stack.pop()

            # Create new start and accept states
            accept, start = State(), State(edges=[frag.start])

            # Point old accept state at the new one
            frag.accept.edges = [frag.start, accept]

        elif cChar == '*':
            # Kleene Star (Zero or more)
            # Pop one fragment
            frag = nfa_stack.pop()

            # Create new start and accept states
            # (accept must exist before it is referenced in the start state's edges)
            accept = State()
            start = State(edges=[frag.start, accept])

            # Point arrows
            frag.accept.edges.extend([frag.start, accept])

        else:
            # Create new start and accept states
            accept = State()
            start = State(label=cChar, edges=[accept])

        # New instance of fragment represents NFA
        newFrag = Fragment(start, accept)

        nfa_stack.append(newFrag)

    # The NFA stack should have exactly 1 NFA
    return nfa_stack.pop()
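
A minimal sketch of driving compile(), assuming shunt() is the usual shunting-yard infix-to-postfix converter and '.' is explicit concatenation:

nfa = compile('a.b|c*')
print(type(nfa).__name__)  # Fragment, with .start and .accept State objects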
Example No. 16
from fragment_utils import (preceding_non_labelled_sentences, get_fragment,
                            next_nl_or_end, split_sentences_multi_until,
                            split_sentences_multi, split_in_sentences,
                            merge_short_sentences, calibrate, encode, decode,
                            surrounding_word, next_token, tokenize_string,
                            spacy_tokenize, insert_tags_list, insert_tags)

from fragment import Fragment

# Need some real data
#from load_data import redux

article = 'abcd\n1234\npqrs'
article_extra = 'abcd\n\n1234\n\npqrs'

f1 = Fragment(1, 2, 'f1')
f2 = Fragment(6, 8, 'f2')
f3 = Fragment(11, 13, 'f3')

article_2 = 'aaaaa\n\nbbbbb\nccccc\n\n01234'
fragments_2 = [
    Fragment(2, 6, 'foo'),
    Fragment(4, 8, 'bar'),
    Fragment(21, 23, 'foo')
]


def test_next_nl_or_end():
    s = 'abcd\n\npqrs'
    n = next_nl_or_end(s)
    assert n == 6
Example No. 17
def test_real_article():
    # 'id' is assumed from the article file name below; without a local binding,
    # the builtin id() would be passed to split_sentences_multi
    id = '111111135'

    fragments = [
        Fragment(543, 683, 'Causal Oversimplification'),
        Fragment(582, 611, 'Loaded Language'),
        Fragment(835, 844, 'Loaded Language'),
        Fragment(1476, 1483, 'Loaded Language'),
        Fragment(1929, 2341, 'Appeal to authority'),
        Fragment(2045, 2051, 'Loaded Language'),
        Fragment(2984, 3018, 'Name calling/Labeling'),
        Fragment(3045, 3084, 'Loaded Language'),
        Fragment(3262, 3319, 'Exaggeration/Minimisation'),
        Fragment(3286, 3319, 'Bandwagon'),
        Fragment(3286, 3319, 'Reductio ad hitlerum'),
        Fragment(4433, 4498, 'Doubt'),
        Fragment(4580, 4590, 'Loaded Language'),
        Fragment(4583, 4590, 'Repetition'),
        Fragment(6524, 6539, 'Loaded Language'),
        Fragment(6813, 6839, 'Loaded Language'),
        Fragment(7009, 7036, 'Exaggeration/Minimisation'),
        Fragment(7134, 7152, 'Loaded Language')
    ]

    article_file_name = "propaganda_detection/datasets/train-articles/article111111135.txt"

    with open(article_file_name, 'r') as f:
        article = f.read()

    data = split_sentences_multi(id, article, fragments, include_empty=True)
    for f in fragments:
        print(f"{f.label} [{get_fragment(article, f)}]")
    assert len(data) == 47
Example No. 18
def read_delete_insert_info(fname):
    logger.info('reading "{}"'.format(fname))
    deletes = []
    inserts = []
    try:
        with open(fname) as f:
            for line in f:
                line = line.rstrip()

                m = named_node_delete_pat.search(line)
                if m:
                    size = int(m.group('size'))
                    name = m.group('name')
                    if name and size > read_delete_insert_info_size_threshold:
                        excluded = get_excluded(m.group('exc'))
                        elems = Fragment(m.group('elems'))
                        rest = m.group('rest')
                        loc = m.group('loc')
                        kind = m.group('kind') + '|' + rest
                        gnid = int(m.group('gnid'))
                        deletes.append({'loc': loc, 'size': size, 'kind': kind,
                                        'name': name, 'gnid': gnid,
                                        'excluded': excluded, 'elems': elems})
                else:
                    m = delete_pat.search(line)
                    if m:
                        size = int(m.group('size'))
                        if size > read_delete_insert_info_size_threshold:
                            kind = m.group('kind')
                            loc = m.group('loc')
                            gnid = int(m.group('gnid'))
                            excluded = get_excluded(m.group('exc'))
                            elems = Fragment(m.group('elems'))
                            deletes.append({'loc': loc, 'size': size, 'kind': kind,
                                            'name': None, 'gnid': gnid,
                                            'excluded': excluded, 'elems': elems})

                m = named_node_insert_pat.search(line)
                if m:
                    size = int(m.group('size'))
                    name = m.group('name')
                    if name and size > read_delete_insert_info_size_threshold:
                        excluded = get_excluded(m.group('exc'))
                        elems = Fragment(m.group('elems'))
                        rest = m.group('rest')
                        loc = m.group('loc')
                        kind = m.group('kind') + '|' + rest
                        gnid = int(m.group('gnid'))
                        inserts.append({'loc': loc, 'size': size, 'kind': kind,
                                        'name': name, 'gnid': gnid,
                                        'excluded': excluded, 'elems': elems})
                else:
                    m = insert_pat.search(line)
                    if m:
                        size = int(m.group('size'))
                        if size > read_delete_insert_info_size_threshold:
                            kind = m.group('kind')
                            loc = m.group('loc')
                            gnid = int(m.group('gnid'))
                            excluded = get_excluded(m.group('exc'))
                            elems = Fragment(m.group('elems'))
                            inserts.append({'loc': loc, 'size': size, 'kind': kind,
                                            'name': None, 'gnid': gnid,
                                            'excluded': excluded, 'elems': elems})
    except IOError as e:
        logger.warning(str(e))

    return (deletes, inserts)
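
The four *_pat regexes and get_excluded() are defined elsewhere in the module; the loop only relies on their named groups. A purely hypothetical shape, just to show the fields being read:

import re
# Hypothetical pattern; the real one lives elsewhere in this module
delete_pat = re.compile(r'delete (?P<kind>\w+) gnid=(?P<gnid>\d+) size=(?P<size>\d+)'
                        r' loc=(?P<loc>\S+) exc=\[(?P<exc>[^\]]*)\] elems=(?P<elems>\S+)')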
Example No. 19
                                "samples/oval.wav")
settings.SILENCE_THRESHOLD = 500
settings.DELTA_THRESHOLD = 200

# In ms, so it works with all framerates
settings.WINDOW_SIZE = 200
settings.MIN_SILENCE_LEN = 300
settings.MIN_FRAGMENT_LEN = 400

main = SpeechLib(settings)
main.settings.OUT_DIR = path.join(path.dirname(path.realpath(__file__)),
                                  "samples/out/%s/" % main.generate_runid())
print("Run ID: %s" % main.generate_runid())

# Clip
min_fragment = Fragment(main.src_fragment, 0,
                        (main.src_fragment.src_freq * 60))

t = microtime()
silence_fragments, silences = main.fragments_by_silence(min_fragment)
print("%i fragments by silence (%i ms)" %
      (len(silence_fragments), microtime() - t))

t = microtime()
delta_fragments, windows = [], []
for f in silence_fragments:
    if len(f) >= main.settings.MIN_FRAGMENT_LEN:
        _delta_fragments, _windows = main.fragments_by_delta(f)
        delta_fragments += _delta_fragments
        windows += _windows

print("%i fragments by delta (%i ms)" %