def fragments_by_delta(self, from_fragment):
    if len(from_fragment) < self.settings.MIN_FRAGMENT_LEN:
        raise IndexError(
            "Can't process fragment: MIN_FRAGMENT_LEN is greater than the fragment length")
    fragments_found = []
    windows = self.extract_windows(from_fragment)
    fragment_start = None
    for i in range(len(windows)):
        # Loudness delta relative to the previous window (first window: its own mean)
        windows[i].delta = (windows[i].mean - windows[i - 1].mean) if i > 0 else windows[i].mean
        if windows[i].delta > self.settings.DELTA_THRESHOLD and fragment_start is None:
            fragment_start = windows[i].start_frame - (self.settings.WINDOW_SIZE // 2)
        elif (windows[i].delta > self.settings.DELTA_THRESHOLD
                and fragment_start is not None
                and windows[i].mean > self.settings.SILENCE_THRESHOLD):
            if windows[i].start_frame - fragment_start >= self.settings.MIN_FRAGMENT_LEN:
                fragments_found.append(
                    Fragment(from_fragment, fragment_start, windows[i].start_frame))
                fragment_start = windows[i].start_frame - (self.settings.WINDOW_SIZE // 2)
    if fragment_start is not None:
        fragments_found.append(
            Fragment(from_fragment, fragment_start, from_fragment.window.end_frame))
    return fragments_found, windows
def _get_data(self, label_identifier=None):
    """The original label_identifier is 'semeval2020'; translate to 'semeval2021'.

    If label_identifier is set, we translate here in _get_data, so that the
    translation happens only when parsing the original dataset.
    """
    self.labels = read_labels_from_file(label_identifier)
    # File containing labelled data
    label_file = os.path.join(self.task_config['dir'], self.task_config['label_file'])
    with open(label_file, 'r') as f:
        lst = f.readlines()
    self.data = []
    prev_article_id = -1
    fragments = []
    for line in lst:
        article_id, p_type, start, end = line.strip().split('\t')
        article_id = int(article_id)
        if article_id == prev_article_id:
            # FIXME: hardcoded exceptions for label_identifiers
            if label_identifier != 'semeval2020':
                for n_p_type in normalize_label(p_type):
                    fragments.append(Fragment(int(start), int(end), n_p_type))
            else:
                fragments.append(Fragment(int(start), int(end), p_type))
        else:
            if prev_article_id != -1:
                # Add the previous article
                #print("APPENDING article ", prev_article_id)
                fragments.sort()  # Sort on first tuple element, that is, 'start'
                self.data.append({
                    'id': prev_article_id,
                    'article': self.read_article(prev_article_id),
                    'fragments': fragments
                })
            # Prepare the new one
            prev_article_id = article_id
            fragments = []
            if label_identifier != 'semeval2020':
                for n_p_type in normalize_label(p_type):
                    fragments.append(Fragment(int(start), int(end), n_p_type))
            else:
                fragments.append(Fragment(int(start), int(end), p_type))
    if fragments != []:
        #print("Appending ", prev_article_id)
        self.data.append({
            'id': prev_article_id,
            'article': self.read_article(prev_article_id),
            'fragments': fragments
        })
    print("Total fragments = ", sum(len(d['fragments']) for d in self.data))
    return self.labels, self.data
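# Shape of the tab-separated label file parsed above, one labelled span per
# line (values illustrative, not from the original dataset):
#   article_id \t p_type \t start \t end
#   111111135 \t Loaded Language \t 582 \t 611
#   111111135 \t Doubt \t 4433 \t 4498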
def test_split_sentences_multi_until():
    id = '111'
    start_text, start_fragments, continuation_text, rest_fragments, new_start = \
        split_sentences_multi_until(article_2, fragments_2, start_index=0)
    assert start_text == 'aaaaa\n\nbbbbb\n'
    assert start_fragments == [Fragment(2, 6, 'foo'), Fragment(4, 8, 'bar')]
    assert continuation_text == 'ccccc\n\n01234'
    assert rest_fragments == [Fragment(8, 10, 'foo')]
    assert new_start == 13
    assert article_2[new_start] == 'c'
    assert article_2[new_start - 1] == '\n'
def chopchop():
    for songfile in utils.onlyfiles(RAW_MP3_DIR):
        song_path_in = os.path.join(RAW_MP3_DIR, songfile)
        song = AudioSegment.from_mp3(song_path_in)
        for i, c in enumerate(utils.chunks(song, segment_duration)):
            if len(c) < segment_duration:
                continue
            print(f"Processing {songfile} chunk {i}")
            fragment = Fragment(f"{songfile[:-4]}", i, mp3_seg=c)
            fragment.mp3_to_np()
def decode(s: str, all_labels: list) -> tuple:
    """Convert a string with Begin and End markers into an article and a list of fragments."""
    article = []  # article is built up character by character
    label_dict = {label_index: None for label_index in range(len(all_labels))}
    re_start = r"^\[i-(\d+)\]\s?"
    re_end = r"^\s?\[o-(\d+)\]"
    index = 0
    fragments = []
    errors = 0  # number of errors during processing
    while index < len(s):
        m = re.match(re_start, s[index:])
        n = re.match(re_end, s[index:])
        if m is not None:
            # Set start marker for the label to the current length of the article
            label_index = int(m.group(1))
            if label_dict[label_index] is not None:
                print(f"Ignoring unclosed open marker to open new for "
                      f"'{all_labels[label_index]}'/{label_index} in\n(({s}))")
                errors += 1
            label_dict[label_index] = len(article)
            index += len(m.group(0))
        elif n is not None:
            label_index = int(n.group(1))
            cur_start = label_dict[label_index]
            if cur_start is None:
                # We found an end marker for which there is no start marker
                print(f"Ignoring end marker without start marker for "
                      f"'{all_labels[label_index]}'/{label_index} in\n(({s}))")
                errors += 1
            else:
                cur_end = len(article)
                # Trim leading whitespace from the fragment
                while cur_start < cur_end and article[cur_start] in [' ', '\n', '\t']:
                    cur_start += 1
                if cur_end > cur_start:
                    f = Fragment(cur_start, cur_end, all_labels[label_index])
                    fragments.append(f)
                label_dict[label_index] = None
            index += len(n.group(0))
        else:
            # Regular character. We can just add, but then we have to do bookkeeping
            article.append(s[index])
            index += 1
    # Check if there are loose ends, that is, open markers without closure
    for l_index in range(len(all_labels)):
        if label_dict[l_index] is not None:
            print(f"Ignoring unclosed marker for label "
                  f"'{all_labels[l_index]}'/{l_index} in\n(({s}))")
            errors += 1
    return ''.join(article), fragments, errors
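# A minimal usage sketch of decode (not from the original source): [i-N] opens
# and [o-N] closes a span for label index N, and the returned offsets refer to
# the cleaned article text.
text, frags, n_errors = decode("He is a [i-0]vicious liar[o-0].",
                               ['Loaded Language', 'Doubt'])
# text == 'He is a vicious liar.'
# frags == [Fragment(8, 20, 'Loaded Language')]; n_errors == 0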
def __init__(self, settings):
    self.settings = settings
    self.src_fragment = Fragment(self.settings.AUDIO_FILE)
    # Convert ms back to # of frames considering the sample frequency
    self.settings.WINDOW_SIZE = int(
        self.settings.WINDOW_SIZE * (self.src_fragment.src_freq / 1000))
    self.settings.MIN_SILENCE_LEN = int(
        self.settings.MIN_SILENCE_LEN * (self.src_fragment.src_freq / 1000))
    self.settings.MIN_FRAGMENT_LEN = int(
        self.settings.MIN_FRAGMENT_LEN * (self.src_fragment.src_freq / 1000))
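# Worked example of the ms-to-frames conversion above (illustrative numbers,
# not from the original source): with src_freq = 44100 Hz and a WINDOW_SIZE of
# 200 ms, the window becomes int(200 * (44100 / 1000)) = 8820 frames.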
def loadJSON2021(path: str):
    """
    :param path: Comma separated list of paths to JSON files
    :return: list of {'id', 'article', 'fragments'} dicts
    """
    data = []
    file_list = [x.strip() for x in path.split(',')]
    for file_name in file_list:
        with open(file_name, 'r', encoding='utf8') as f:
            json_data = json.load(f)
        for item in json_data:
            id = item['id']
            txt = item['text']
            fragments = []
            if 'image' not in item:
                # Variation 1: spans with explicit offsets
                labels = item['labels']
                # labels = [{start:, end:, technique:, text_fragment}, ...]
                # We skip 'text_fragment' as it can be derived from text and start/end
                for frag in labels:
                    if frag['text_fragment'].strip() == '':
                        # Deal with a '\n' fragment by skipping it
                        continue
                    fragments.append(Fragment(frag['start'], frag['end'], frag['technique']))
            else:
                # Variation 2: labels that apply to the whole text
                start = 0
                end = len(txt)
                for l in item['labels']:
                    fragments.append(Fragment(start, end, l))
                # There is an image, but we ignore it
                image = item['image']
            fragments.sort()
            data.append({'id': id, 'article': txt, 'fragments': fragments})
    return data
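# Illustrative input for loadJSON2021, inferred from the parsing code above
# (field values are made up). Variation 1 items carry character offsets;
# Variation 2 items carry an 'image' field plus whole-text label names:
# [
#   {"id": "1", "text": "some article text",
#    "labels": [{"start": 0, "end": 4, "technique": "Doubt",
#                "text_fragment": "some"}]},
#   {"id": "2", "text": "a meme caption", "image": "meme2.png",
#    "labels": ["Loaded Language"]}
# ]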
def test_calibrate():
    match_text = 'hello sad big world!'
    orig_text = 'goodbye sad big world!'
    fragment = Fragment(6, 13, 'foo')
    assert fragment.extract(match_text) == 'sad big'
    res = calibrate(fragment, match_text, orig_text, distance=3)
    print('new fragment = ', res)
    assert res == Fragment(8, 15, 'foo')

    match_text = ' hello sad big world!'
    orig_text = 'hello sad big world!'
    fragment = Fragment(1, 6, 'foo')
    assert fragment.extract(match_text) == 'hello'
    res = calibrate(fragment, match_text, orig_text, distance=3)
    print('new fragment = ', res)
    assert res == Fragment(0, 5, 'foo')
def fragments_by_silence(self, from_fragment, SILENCE_THRESHOLD=None):
    if len(from_fragment) < self.settings.MIN_SILENCE_LEN:
        raise IndexError(
            "Can't process fragment: MIN_SILENCE_LEN is greater than the fragment length")
    SILENCE_THRESHOLD = self.settings.SILENCE_THRESHOLD if SILENCE_THRESHOLD is None else SILENCE_THRESHOLD
    silences = []
    windows = self.extract_windows(from_fragment, WINDOW_SIZE=1000)
    silence_start = None
    for i in range(len(windows)):
        if windows[i].mean <= SILENCE_THRESHOLD:
            if silence_start is None:
                silence_start = windows[i].start_frame
        elif silence_start is not None:
            if (windows[i].start_frame - silence_start) >= self.settings.MIN_SILENCE_LEN:
                silences.append(Window(silence_start, windows[i].start_frame))
            silence_start = None
    # for i in range(len(from_fragment)):
    #     i_offset = i + from_fragment.window.start_frame
    #     if abs(from_fragment.src_audio_signal[i_offset]) <= SILENCE_THRESHOLD:
    #         if silence_start is None:
    #             silence_start = i_offset
    #     elif silence_start is not None:
    #         if (i_offset - silence_start) >= self.settings.MIN_SILENCE_LEN:
    #             silences.append(Window(silence_start, i_offset))
    #         silence_start = None
    # Fragments start 3/4 prev silence and end 1/4 curr silence
    fragments_found = [
        Fragment(from_fragment,
                 silences[i - 1].end_frame - (len(silences[i - 1]) // 4),
                 silences[i].start_frame + (len(silences[i]) // 4))
        for i in range(1, len(silences))
    ]
    return fragments_found, silences
def _get_data(self, label_identifier="semeval2021"): self.labels = read_labels_from_file(label_identifier) self.data = [] training_set_file = os.path.join(self.task_config['dir'], self.task_config['label_file']) file_list = [x.strip() for x in training_set_file.split(',')] for file_name in file_list: with open(file_name, 'r', encoding='utf8') as f: json_data = json.load(f) for item in json_data: id = item['id'] txt = item['text'] labels = item['labels'] # labels = [{start:, end:, technique:, text_fragment}, ...] # We skip that 'text_fragment' as it can be derived from text and start/end fragments = [] for frag in labels: if frag['text_fragment'].strip() == '': # Deal with a '\n' fragment by skipping it continue fragments.append(Fragment(frag['start'], frag['end'], frag['technique'])) fragments.sort() self.data.append({'id': id, 'article': txt, 'fragments': fragments}) return self.labels, self.data
def load_all_fragment(self):
    file_names_of_plates = os.listdir(const_values.FLAGS.dir_of_plates)
    file_names_of_fractures = os.listdir(const_values.FLAGS.dir_of_fractures)
    names_of_fragment = []
    for name_of_plate in file_names_of_plates:
        if name_of_plate.endswith(self.prefix):
            file_names_of_fragments = os.listdir(const_values.FLAGS.dir_of_plates + name_of_plate)
            for name_of_fragment in file_names_of_fragments:
                if name_of_fragment.endswith('.stl'):
                    names_of_fragment.append(name_of_plate + '/' + name_of_fragment)
    # print(names_of_fragment)
    names_of_fracture = [[] for _ in range(len(names_of_fragment))]
    for name_of_fracture in file_names_of_fractures:
        s = name_of_fracture.split('-')
        if s[0] == self.prefix:
            names_of_fracture[int(s[1]) - 1].append(name_of_fracture)
    # print(names_of_fracture)
    for i in range(len(names_of_fragment)):
        # print('fragment: ', const_values.FLAGS.dir_of_plates + names_of_fragment[i])
        # print('fracture: ', const_values.FLAGS.dir_of_fractures + names_of_fracture[i][0])
        fragment = self.read_stl(const_values.FLAGS.dir_of_plates + names_of_fragment[i])
        fractures = [self.read_stl(const_values.FLAGS.dir_of_fractures + j)
                     for j in names_of_fracture[i]]
        self.fragments.append(Fragment(fragment, fractures, self.prefix))
def test_split_sentences_multi():
    id = '111'
    part = get_fragment(article_2, fragments_2[2])
    assert part == '12'
    data = split_sentences_multi(id, article_2, fragments_2,
                                 include_empty=True)  #, start_index=0, data=[])
    print(data)
    assert data[0] == {
        'id': '111x',
        'article': 'aaaaa\n\nbbbbb\n',
        'fragments': [Fragment(2, 6, 'foo'), Fragment(4, 8, 'bar')]
    }
    assert data[1] == {
        'id': '111_0p0',
        'article': 'ccccc\n\n',
        'fragments': []
    }
    #assert get_fragment(data[2]['article'], data[2]['fragments'][0]) == '12'
    assert data[2] == {
        'id': '111_0x',
        'article': '01234',
        'fragments': [Fragment(1, 3, 'foo')]
    }
    # Add a fragment that starts after the previous fragments,
    # but starts before the end of the sentence included with the previous fragments.
    fragments_2.append(Fragment(9, 10, 'tolstoy'))
    fragments_2.sort()
    data = split_sentences_multi(id, article_2, fragments_2, include_empty=True)
    assert data[0] == {
        'id': '111x',
        'article': 'aaaaa\n\nbbbbb\n',
        'fragments': [
            Fragment(2, 6, 'foo'),
            Fragment(4, 8, 'bar'),
            Fragment(9, 10, 'tolstoy')
        ]
    }
    assert data[1] == {
        'id': '111_0p0',
        'article': 'ccccc\n\n',
        'fragments': []
    }
    assert get_fragment(data[2]['article'], data[2]['fragments'][0]) == '12'
    assert data[2] == {
        'id': '111_0x',
        'article': '01234',
        'fragments': [Fragment(1, 3, 'foo')]
    }
import sys
import pprint

from fragment import Fragment

# Graph from start to stop.
# Convert fragments from string to Counter.
fragments = ['*'] + [
    '*A', '*AB', '*ABC', '*ABCD', '*CCDDD', '*ABCDE', '*AABEE', '*ACCDDD',
    '*AABCDE', '*AABCEE', '*ACCCDDD', '*AABCCDE', '*AABBCEE', '*AAAADDE',
    '*AABBCCDE'
]
fragments = [Fragment(fragment) for index, fragment in enumerate(fragments)]
#fragments = [Fragment(fragment) for index, fragment in enumerate(fragments) if index % 3 in {0, 1}]

# Create graph
graph = dict()
for fragment in fragments:
    graph[fragment] = set()
for fragment1 in fragments:
    for fragment2 in fragments:
        # Make an edge if fragment1 is a subset of fragment2
        if fragment1 < fragment2:
            # Determine the difference between both fragments
            diff = fragment2.difference(fragment1)
            if Fragment('*') <= diff:
                diff = diff.difference(Fragment('*'))
            # if Fragment('o') <= diff:
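# Hedged illustration of the Fragment semantics assumed above: Fragment wraps
# a character multiset (Counter-like), so '<' tests proper containment and
# difference() is multiset subtraction. For example:
#   Fragment('*AB') < Fragment('*ABC')            # True
#   Fragment('*ABC').difference(Fragment('*AB'))  # Fragment('C')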
def calibrate(fragment: Fragment, match_text: str, orig_text: str, distance=3) -> Fragment:
    """Calibrates a fragment to another text, assuming it to be nearly right.

    Effectively, it moves a fragment a short distance to better match the
    original text."""
    fragment_text = fragment.extract(match_text).lower()
    print(f'INPUT FRAGMENT = [{fragment_text}]')
    first_word_re = re.compile(r'\A\W*(\w+)\W.*', re.MULTILINE | re.DOTALL)
    # Include trailing end-of-line punctuation
    last_word_re = re.compile(r'.*\W(\w+[!.?]?)\W*\Z', re.MULTILINE | re.DOTALL)
    m = first_word_re.match(fragment_text + ' ')
    assert m is not None, f"First word matching failed for [{fragment_text} ] for {fragment}"
    first_word = m.group(1)
    # Deal with aberrant single letter words, skip them unless they are 'i' or 'a'
    # Commented out: this had a negative effect
    if len(first_word) == 1 and first_word not in ['i', 'a']:
        fragment.start += 1
        fragment_text = fragment_text[1:]
        m = first_word_re.match(fragment_text + ' ')
        assert m is not None, f"First word matching failed (b) for [{fragment_text} ] for {fragment}"
        first_word = m.group(1)
    n = last_word_re.match(' ' + fragment_text)
    assert n is not None, f"Last word matching failed for [ {fragment_text}] for {fragment}"
    last_word = n.group(1)
    have_set_first_word = False
    have_set_last_word = False
    start, end = fragment.start, fragment.end
    startpos = max(start - distance, 0)
    for i in range(startpos, start + distance):
        if orig_text[i:].lower().startswith(first_word):
            start = i
            have_set_first_word = True
            break
    endpos = min(end + distance, len(orig_text))
    for i in range(end - distance, endpos):
        if orig_text[:i].lower().endswith(last_word):
            end = i
            have_set_last_word = True
            break
    if not have_set_first_word:
        res = surrounding_word(orig_text, start)
        if res is None:
            print("starting in empty space")
        else:
            start = res[0]
    if not have_set_last_word:
        res = surrounding_word(orig_text, end, with_line_end=True)
        if res is None:
            print("ending in empty space")
        else:
            end = res[1]
    fragment.start = start
    fragment.end = end
    print(f'OUTPUT FRAGMENT = [{fragment.extract(orig_text)}]')
    return fragment
def compile(infix):
    """Return the NFA fragment of the infix expression.

    :param infix: regular expression
    :type infix: string
    :return: Fragment
    """
    # Convert infix to postfix
    postfix = shunt(infix)
    # Make postfix a stack
    postfix = list(postfix)[::-1]
    # Stack to keep track of fragments
    nfa_stack = []
    while postfix:
        cChar = postfix.pop()
        if cChar == '.':
            # Concatenation: pop two Fragments
            frag1, frag2 = nfa_stack.pop(), nfa_stack.pop()
            # Point frag2 accept state at frag1 start state
            frag2.accept.edges.append(frag1.start)
            start, accept = frag2.start, frag1.accept
        elif cChar == '|':
            # Alternation: pop two Fragments
            frag1, frag2 = nfa_stack.pop(), nfa_stack.pop()
            # Create new start and accept states
            accept = State()
            start = State(edges=[frag1.start, frag2.start])
            # Point old accept states to the new one
            frag2.accept.edges.append(accept)
            frag1.accept.edges.append(accept)
        elif cChar == '?':
            # Zero or one.
            # One: accepts one character after the '?'.
            # Zero: the new start state also points directly at the new accept
            # state. Similar to the Kleene star, but the old accept state does
            # not point back to the fragment's start, so nothing repeats.
            # Pop one fragment
            frag = nfa_stack.pop()
            # Create new start and accept states
            accept = State()
            start = State(edges=[frag.start, accept])
            # Point old accept state to new accept state
            frag.accept.edges.append(accept)
        elif cChar == '+':
            # One or more: accepts one character, and if more are read in,
            # points back to itself (frag.start).
            # Pop one fragment
            frag = nfa_stack.pop()
            # Create new start and accept states
            accept = State()
            start = State(edges=[frag.start])
            # Point old accept state at the new one
            frag.accept.edges = [frag.start, accept]
        elif cChar == '*':
            # Kleene star (zero or more): pop one fragment
            frag = nfa_stack.pop()
            # Create new start and accept states
            accept = State()
            start = State(edges=[frag.start, accept])
            # Point arrows
            frag.accept.edges.extend([frag.start, accept])
        else:
            # Literal character: create new start and accept states
            accept = State()
            start = State(label=cChar, edges=[accept])
        # New instance of Fragment represents the NFA built in this step
        newFrag = Fragment(start, accept)
        nfa_stack.append(newFrag)
    # The NFA stack should have exactly 1 NFA
    return nfa_stack.pop()
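# A minimal usage sketch, assuming the shunt(), State() and Fragment(start,
# accept) helpers this module relies on: compile an infix regex into an NFA
# fragment and inspect its entry and accept states.
nfa = compile('a.b|c*')
print(nfa.start, nfa.accept)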
from fragment_utils import (preceding_non_labelled_sentences, get_fragment,
                            next_nl_or_end, split_sentences_multi_until,
                            split_sentences_multi, split_in_sentences,
                            merge_short_sentences, calibrate, encode, decode,
                            surrounding_word, next_token, tokenize_string,
                            spacy_tokenize, insert_tags_list, insert_tags)
from fragment import Fragment

# Need some real data
#from load_data import redux

article = 'abcd\n1234\npqrs'
article_extra = 'abcd\n\n1234\n\npqrs'
f1 = Fragment(1, 2, 'f1')
f2 = Fragment(6, 8, 'f2')
f3 = Fragment(11, 13, 'f3')

article_2 = 'aaaaa\n\nbbbbb\nccccc\n\n01234'
fragments_2 = [
    Fragment(2, 6, 'foo'),
    Fragment(4, 8, 'bar'),
    Fragment(21, 23, 'foo')
]


def test_next_nl_or_end():
    s = 'abcd\n\npqrs'
    n = next_nl_or_end(s)
    assert n == 6
def test_real_article():
    fragments = [
        Fragment(543, 683, 'Causal Oversimplification'),
        Fragment(582, 611, 'Loaded Language'),
        Fragment(835, 844, 'Loaded Language'),
        Fragment(1476, 1483, 'Loaded Language'),
        Fragment(1929, 2341, 'Appeal to authority'),
        Fragment(2045, 2051, 'Loaded Language'),
        Fragment(2984, 3018, 'Name calling/Labeling'),
        Fragment(3045, 3084, 'Loaded Language'),
        Fragment(3262, 3319, 'Exaggeration/Minimisation'),
        Fragment(3286, 3319, 'Bandwagon'),
        Fragment(3286, 3319, 'Reductio ad hitlerum'),
        Fragment(4433, 4498, 'Doubt'),
        Fragment(4580, 4590, 'Loaded Language'),
        Fragment(4583, 4590, 'Repetition'),
        Fragment(6524, 6539, 'Loaded Language'),
        Fragment(6813, 6839, 'Loaded Language'),
        Fragment(7009, 7036, 'Exaggeration/Minimisation'),
        Fragment(7134, 7152, 'Loaded Language')
    ]
    article_file_name = "propaganda_detection/datasets/train-articles/article111111135.txt"
    with open(article_file_name, 'r') as f:
        article = f.read()
    data = split_sentences_multi(id, article, fragments, include_empty=True)
    for f in fragments:
        print(f"{f.label} [{get_fragment(article, f)}]")
    assert len(data) == 47
def read_delete_insert_info(fname):
    logger.info('reading "{}"'.format(fname))
    deletes = []
    inserts = []
    try:
        f = open(fname)
        for line in f:
            line = line.rstrip()
            m = named_node_delete_pat.search(line)
            if m:
                size = int(m.group('size'))
                name = m.group('name')
                if name and size > read_delete_insert_info_size_threshold:
                    excluded = get_excluded(m.group('exc'))
                    elems = Fragment(m.group('elems'))
                    rest = m.group('rest')
                    loc = m.group('loc')
                    kind = m.group('kind') + '|' + rest
                    gnid = int(m.group('gnid'))
                    r = {'loc': loc, 'size': size, 'kind': kind, 'name': name,
                         'gnid': gnid, 'excluded': excluded, 'elems': elems}
                    deletes.append(r)
            else:
                m = delete_pat.search(line)
                if m:
                    size = int(m.group('size'))
                    if size > read_delete_insert_info_size_threshold:
                        kind = m.group('kind')
                        loc = m.group('loc')
                        gnid = int(m.group('gnid'))
                        excluded = get_excluded(m.group('exc'))
                        elems = Fragment(m.group('elems'))
                        r = {'loc': loc, 'size': size, 'kind': kind, 'name': None,
                             'gnid': gnid, 'excluded': excluded, 'elems': elems}
                        deletes.append(r)
            m = named_node_insert_pat.search(line)
            if m:
                size = int(m.group('size'))
                name = m.group('name')
                if name and size > read_delete_insert_info_size_threshold:
                    excluded = get_excluded(m.group('exc'))
                    elems = Fragment(m.group('elems'))
                    rest = m.group('rest')
                    loc = m.group('loc')
                    kind = m.group('kind') + '|' + rest
                    gnid = int(m.group('gnid'))
                    r = {'loc': loc, 'size': size, 'kind': kind, 'name': name,
                         'gnid': gnid, 'excluded': excluded, 'elems': elems}
                    inserts.append(r)
            else:
                m = insert_pat.search(line)
                if m:
                    size = int(m.group('size'))
                    if size > read_delete_insert_info_size_threshold:
                        kind = m.group('kind')
                        loc = m.group('loc')
                        gnid = int(m.group('gnid'))
                        excluded = get_excluded(m.group('exc'))
                        elems = Fragment(m.group('elems'))
                        r = {'loc': loc, 'size': size, 'kind': kind, 'name': None,
                             'gnid': gnid, 'excluded': excluded, 'elems': elems}
                        inserts.append(r)
        f.close()
    except IOError as e:
        logger.warning(str(e))
    return (deletes, inserts)
"samples/oval.wav") settings.SILENCE_THRESHOLD = 500 settings.DELTA_THRESHOLD = 200 # In ms, so it works with all framerates settings.WINDOW_SIZE = 200 settings.MIN_SILENCE_LEN = 300 settings.MIN_FRAGMENT_LEN = 400 main = SpeechLib(settings) main.settings.OUT_DIR = path.join(path.dirname(path.realpath(__file__)), "samples/out/%s/" % main.generate_runid()) print("Run ID: %s" % main.generate_runid()) # Clip min_fragment = Fragment(main.src_fragment, 0, (main.src_fragment.src_freq * 60)) t = microtime() silence_fragments, silences = main.fragments_by_silence(min_fragment) print("%i fragments by silence (%i ms)" % (len(silence_fragments), microtime() - t)) t = microtime() delta_fragments, windows = [], [] for f in silence_fragments: if len(f) >= main.settings.MIN_FRAGMENT_LEN: _delta_fragments, _windows = main.fragments_by_delta(f) delta_fragments += _delta_fragments windows += _windows print("%i fragments by delta (%i ms)" %