class CustomMeterGraph(MeterGraph):
    """Meter graph assembled from a meter phrase rather than the preset meters.

    The phrase grammar understood by ``parse_meter`` is:

    * a bare run of ``=`` / ``-``  -- a required segment;
    * ``[...]``                    -- a required group;
    * ``(...)``                    -- an optional group;
    * ``|`` inside a group         -- alternative branches (a fork);
    * ``+`` directly after a group -- the group may repeat.

    Every syllable becomes one graph node whose ``type`` attribute is the
    syllable character.  ``graph_scan`` matches ``=`` nodes with the long
    parser (``self.lp``) and ``-`` nodes with the short parser (``self.sp``).
    """

    @staticmethod
    def get_matra_count(x):
        """Return the matra count of the scan string *x*.

        ``=`` counts as 2 and ``-`` as 1; any other character is ignored.
        """
        total = 0
        for c in x:
            if c == '=':
                total += 2
            elif c == '-':
                total += 1
        return total

    def __init__(self, phrase='', count=None):
        """Create the graph and, if *phrase* is non-empty, build it from *phrase*.

        Count refers to matra count; can be an int or list of ints.  When set,
        ``graph_scan`` only accepts scans whose matra count is one of these
        values.

        Raises:
            AssertionError: if *count* is neither a list nor an int, or
                contains a value that is not positive.
        """
        self.DG = self.create_graph()
        self.pp = GraphParser('settings/urdu-meter.yaml', blank=' ')  # token parser
        self.lp = GraphParser('settings/long.yaml', blank='b')  # long parser
        self.sp = GraphParser('settings/short.yaml', blank='b')  # short parser
        self.components = []
        if count:
            assert isinstance(count, (list, int))
            if isinstance(count, int):
                # Normalise a single value so later code can use ``in``.
                count = [count]
            for x in count:
                assert x > 0
        self.count = count
        if phrase != '':
            self.init_from_phrase(phrase)

    def transcription_of(self, inp, join_ch=''):
        """Provide the transcription (into metrical units) of *inp*.

        Returns the productions of the token parser's matches joined by
        *join_ch*, or ``None`` when the parse has no matches.
        """
        p = self.pp.parse(inp)
        if p.matches:
            return join_ch.join([match.production for match in p.matches])

    def branch(self, syllables, ending=False, number_of_repeats=0,
               optional=False, weight=0, skip_if_matched=False):
        """Return a ``Branch`` built from the given arguments (see ``add_fork``)."""
        return Branch(syllables, ending, number_of_repeats, optional, weight,
                      skip_if_matched)

    def create_graph(self):
        """Return a new directed graph containing only the start node ``0``."""
        DG = nx.DiGraph()
        DG.add_node(0, type='0')  # this is the start
        return DG

    def add_graph_edge(self, curr_node_ids, new_node_id, optional=NOT_OPTIONAL,
                       weight=0):
        """Add an edge from each node in *curr_node_ids* to *new_node_id*.

        Each edge gets a ``weight`` attribute, a ``bad_combos`` attribute when
        ``settings.bad_types`` has an entry for the (source type, target type)
        pair, and an ``optional`` attribute when *optional* is not
        ``NOT_OPTIONAL``.
        """
        DG = self.DG
        for curr_node_id in curr_node_ids:
            DG.add_edge(curr_node_id, new_node_id)
            type_pair = (DG.node[curr_node_id]['type'],
                         DG.node[new_node_id]['type'])
            edge = DG[curr_node_id][new_node_id]
            if type_pair in settings.bad_types:
                edge['bad_combos'] = settings.bad_types[type_pair]
            if optional != NOT_OPTIONAL:
                edge['optional'] = optional
            # allow preference for ignoring
            edge['weight'] = weight

    def end_nodes_of_component(self, component):
        """Return the list of end node ids of a ``Fork`` or ``MeterSegment``.

        Raises:
            AssertionError: if *component* is of any other type.
        """
        component_type = type(component).__name__
        assert component_type in ['Fork', 'MeterSegment']
        if component_type == 'Fork':
            return [segment.end_node for segment in component.segments]
        return [component.end_node]

    def _previous_component_state(self):
        """Return ``(end nodes, optional flag)`` of the last component.

        With no components yet, the chain starts from node ``0`` and nothing
        is optional.
        """
        if not self.components:
            return [0], NOT_OPTIONAL
        prev_component = self.components[-1]
        return (self.end_nodes_of_component(prev_component),
                prev_component.optional)

    def _add_bypass_edge(self, start_node, optional):
        """Add an edge into *start_node* that skips the last (optional) component.

        The edge starts from the end nodes of the component before the last
        one, or from the start node ``0`` when the optional component is the
        first one.
        """
        # TODO: allow for multiple optionals (only one component is skipped).
        before_optional = len(self.components) - 2
        if before_optional < 0:
            end_nodes = [0]
        else:
            end_nodes = self.end_nodes_of_component(
                self.components[before_optional])
        self.add_graph_edge(end_nodes, start_node, optional=optional)

    def _add_syllable_chain(self, prev_nodes, syllables, ending,
                            prev_optional, bypass_optional):
        """Append one node per syllable, chained after *prev_nodes*.

        The last node is flagged ``ending`` when *ending* is true.  If the
        previous component was optional, a bypass edge (carrying
        *bypass_optional*) is added into the first new node.

        Returns ``(start_node, end_node)``; for empty *syllables* no node is
        added and ``end_node == start_node - 1``.
        """
        DG = self.DG
        start_node = len(DG.nodes())  # node ids are allocated sequentially
        curr_nodes = prev_nodes
        last_index = len(syllables) - 1
        for i, s in enumerate(syllables):
            new_node = len(DG.nodes())
            DG.add_node(new_node, type=s)
            if i == last_index and ending:
                DG.node[new_node]['ending'] = True
            self.add_graph_edge(curr_nodes, new_node)
            curr_nodes = [new_node]
            if i == 0 and prev_optional != NOT_OPTIONAL:
                self._add_bypass_edge(start_node, bypass_optional)
        return start_node, start_node + len(syllables) - 1

    def add_fork(self, branches, optional=NOT_OPTIONAL, number_of_repeats=0):
        """Append a ``Fork`` whose alternatives are the given ``Branch`` objects.

        Every branch is chained after the end nodes of the previous component.
        When *number_of_repeats* is positive, each branch end is linked back
        to every branch start.  Repeats set on individual branches are
        recorded on their ``MeterSegment`` but add no edges.

        Raises:
            AssertionError: if *branches* is not a list of ``Branch`` objects.
        """
        logging.debug('inside add_fork, number of repeats%s%s',
                      number_of_repeats, branches)
        assert type(branches) == list
        for b in branches:
            assert type(b).__name__ == 'Branch'
        prev_nodes, prev_optional = self._previous_component_state()
        fork = Fork(segments=[], optional=optional)
        branch_starts = []
        branch_ends = []
        for branch in branches:
            syllables = branch.syllables
            # NOTE(review): branch.weight and branch.skip_if_matched are not
            # copied onto the edges, although graph_scan reads the edge
            # attributes 'weight' and 'skip_if_matched' -- confirm intent.
            start_node, end_node = self._add_syllable_chain(
                prev_nodes, syllables, branch.ending, prev_optional, True)
            branch_starts.append(start_node)
            if len(syllables) > 0:
                branch_ends.append(end_node)
            fork.segments.append(MeterSegment(
                syllables=syllables, ending=branch.ending,
                number_of_repeats=branch.number_of_repeats, optional=optional,
                start_node=start_node, end_node=end_node))
        if number_of_repeats > 0:
            logging.debug('repeats inside fork')
            for j in branch_starts:
                self.add_graph_edge(branch_ends, j)
        self.components.append(fork)

    def add_segment(self, syllables, ending=False, number_of_repeats=0,
                    optional=NOT_OPTIONAL):
        """Append a linear ``MeterSegment`` with one node per syllable.

        The segment is chained after the end nodes of the previous component.
        When *number_of_repeats* is positive, the last node is linked back to
        the segment's first node.
        """
        prev_nodes, prev_optional = self._previous_component_state()
        bypass_optional = None
        if prev_optional != NOT_OPTIONAL:
            # NOTE(review): the bypass edge carries the optional flag of the
            # FIRST component, not of the component being skipped; kept as in
            # the original -- confirm this is intended.
            bypass_optional = self.components[0].optional
        start_node, end_node = self._add_syllable_chain(
            prev_nodes, syllables, ending, prev_optional, bypass_optional)
        if len(syllables) > 0 and number_of_repeats > 0:
            self.add_graph_edge([end_node], start_node)
        self.components.append(MeterSegment(
            syllables=syllables, ending=ending,
            number_of_repeats=number_of_repeats, optional=optional,
            start_node=start_node, end_node=end_node))

    @staticmethod
    def _ipa_for(orig_tokens, production):
        """Return the IPA string for *orig_tokens* matched as *production*.

        Tokens missing from ``phonemes.phonemes`` are logged and skipped.
        """
        parts = []
        for tkn in orig_tokens:
            if tkn in phonemes.phonemes:
                parts.append(phonemes.phonemes[tkn])
            else:
                logging.warning('could not find token %s in %s', tkn,
                                phonemes.phonemes)
        node_ipa = u''.join(parts)
        # if nasal after long symbol, switch
        if node_ipa.endswith(u'ː̃'):
            node_ipa = node_ipa[0:-2] + u'̃ː'
        # a production starting with 's_' gets the half-long mark instead of
        # the long mark
        if production.startswith('s_') and node_ipa.endswith(u'ː'):
            node_ipa = node_ipa[0:-1] + u'ˑ'
        return node_ipa

    def graph_scan(self, in_string, parse='', ignore_skipping=False):
        """Scan *in_string* against the graph and return the complete scans.

        Args:
            in_string: text to scan; parsed with ``self.pp`` when *parse* is
                ``''``.
            parse: optional pre-made parse.  When given it is tokenized with
                ``self.pp`` and *in_string* is not parsed.
                NOTE(review): ``parse.matches`` is still read below, so a
                supplied value must expose ``.matches`` -- confirm callers.
            ignore_skipping: when false, once at least one complete scan has
                been found, ``-`` nodes and edges carrying
                ``skip_if_matched`` are no longer explored.

        Returns:
            A list of ``ScanResult`` (``meter_type='CUSTOM'``), one per path
            that consumes all scan tokens, stops on a node flagged ``ending``
            and, if ``self.count`` is set, has a matra count in ``self.count``.
        """
        completed_scans = []  # holds complete scans
        if parse == '':
            parse = self.pp.parse(in_string)  # holds output, matches
            scan_tokens = self.lp.tokenize(parse.output)
        else:
            scan_tokens = self.pp.tokenize(parse)
        logging.debug('parsed as %s', parse)
        logging.debug('scan tokens %s', scan_tokens)
        graph = self.DG

        def descend_node(node_id, token_i, matches, matched_so_far):
            """Try every successor of *node_id* starting at token *token_i*."""
            logging.debug('descending node_id%s', node_id)
            # Visit lower-weight edges first.
            successors = sorted(graph.successors(node_id),
                                key=lambda k: graph[node_id][k]['weight'])
            for successor_id in successors:
                edge = graph[node_id][successor_id]
                already_matched = len(completed_scans) > 0
                if (not ignore_skipping and already_matched
                        and 'skip_if_matched' in edge):
                    logging.debug('********skipping!')
                    continue
                node_type = graph.node[successor_id]['type']
                assert node_type in ('=', '-')
                parser = self.lp if node_type == '=' else self.sp
                if node_type == '-' and not ignore_skipping and already_matched:
                    logging.debug('skipping wild shorts at node %d',
                                  successor_id)
                    continue
                if 'optional' in edge:
                    logging.debug('found an optional edge')
                for m in parser.match_all_at(scan_tokens, token_i):
                    # Reject (previous production, this production) pairs
                    # that the edge lists as bad combinations.
                    if matches and 'bad_combos' in edge:
                        combo = (matches[-1].found, m.production)
                        if combo in edge['bad_combos']:
                            logging.debug('found bad combos %s', combo)
                            continue
                    # Collect the original input tokens behind this match.
                    orig_tokens = []
                    for i in range(token_i, token_i + len(m.tokens)):
                        orig_tokens += parse.matches[i].tokens
                    match_data = NodeMatch(
                        node_type=node_type, matched_tokens=m.tokens,
                        node_id=node_id, orig_tokens=orig_tokens,
                        ipa=self._ipa_for(orig_tokens, m.production),
                        found=m.production, token_i=token_i)
                    # advance token index based on length of match tokens
                    new_token_i = token_i + len(m.tokens)
                    so_far = matched_so_far + node_type
                    curr_matches = list(matches)
                    curr_matches.append(match_data)
                    if new_token_i != len(scan_tokens):
                        descend_node(successor_id, new_token_i, curr_matches,
                                     so_far)
                        continue
                    logging.debug('AT THE END')
                    logging.debug(curr_matches)
                    logging.debug('node is %d%s', successor_id,
                                  graph.node[successor_id])
                    if 'ending' not in graph.node[successor_id]:
                        continue  # input exhausted on a non-ending node
                    logging.debug('AT THE END REALLY')
                    if (not self.count
                            or self.get_matra_count(so_far) in self.count):
                        completed_scans.append(ScanResult(
                            scan=so_far, matches=curr_matches,
                            meter_type='CUSTOM'))

        descend_node(0, 0, [], '')
        return completed_scans

    def init_from_phrase(self, phrase):
        """Remember *phrase* as ``initial_phrase`` and build the graph from it."""
        self.initial_phrase = phrase
        self.parse_meter(phrase)

    def parse_meter(self, phrase):
        """Parse the meter *phrase* and add its segments and forks to the graph.

        The last non-optional component and every optional component after it
        are marked as endings.  A phrase with no recognisable component adds
        nothing.
        """
        pattern = (
            r'(?:'
            r'(?:'
            r'(?P<required_group>\[.+?\])' r'|'
            r'(?P<optional_group>\(.+?\))'
            r')'
            r'(?P<repeated_group>\+)?'
            r')|'
            r'(?P<regular>[=-]+)'
        )
        matches = list(re.finditer(pattern, phrase))
        if not matches:
            return
        optionals = [m.group('optional_group') is not None for m in matches]
        # Walk back over trailing optional groups; stop at index 0 so a phrase
        # made only of optional groups does not run off the list.
        ending_start = len(matches) - 1
        while ending_start > 0 and optionals[ending_start]:
            ending_start -= 1
        for i, m in enumerate(matches):
            ending = i >= ending_start
            required_text = m.group('required_group')
            optional_text = m.group('optional_group')
            if required_text is None and optional_text is None:
                # bare run of '=' / '-'
                self.add_segment(m.group(0), optional=NOT_OPTIONAL,
                                 ending=ending)
                continue
            optional_on = optional_text is not None
            repeat_on = m.group('repeated_group') is not None
            group_text = optional_text if optional_on else required_text
            inner = group_text[1:-1]  # drop the surrounding brackets
            internal_groups = inner.split('|')
            logging.debug('processing group %s optional:%s repeat:%s',
                          inner, optional_on, repeat_on)
            optional_setting = OPTIONAL if optional_on else NOT_OPTIONAL
            # add_segment / add_fork only test this for being positive.
            number_of_repeats = 3 if repeat_on else 0
            if len(internal_groups) == 1:
                self.add_segment(internal_groups[0],
                                 number_of_repeats=number_of_repeats,
                                 optional=optional_setting, ending=ending)
            else:
                branches = [
                    self.branch(syllables, ending=ending,
                                number_of_repeats=number_of_repeats, weight=w)
                    for w, syllables in enumerate(internal_groups)
                ]
                self.add_fork(branches, optional=optional_setting,
                              number_of_repeats=number_of_repeats)

    def get_scan_as_string(self, x):
        """Return the scan string of the first complete scan of *x*, or ``''``."""
        scans = self.graph_scan(x)
        if scans:
            return scans[0].scan
        return ''

    def get_all_scans_as_string(self, x, ignore_skipping=True, separator='\n'):
        """Return all scan strings of *x* joined by *separator*, or ``''``."""
        scans = self.graph_scan(x, ignore_skipping=ignore_skipping)
        if scans:
            return separator.join([result.scan for result in scans])
        return ''
class MeterGraph(object):
    """Graph of the meters listed in ``settings.meters_with_feet``.

    The graph is a prefix tree: node ``0`` is the start, every other node has
    a ``type`` attribute holding one character of a meter, and meters sharing
    a prefix share nodes.  The node reached by a meter's last character
    carries ``meter_type`` and ``meter_full_description``.
    """

    def __init__(self):
        """Build the meter graph and load the token, long and short parsers."""
        self.DG = self.create_graph()
        self.pp = GraphParser('settings/urdu-meter.yaml', blank=' ')
        self.lp = GraphParser('settings/long.yaml', blank='b')
        self.sp = GraphParser('settings/short.yaml', blank='b')

    def create_graph(self):
        """Return a directed graph holding every meter in ``settings.meters_with_feet``.

        Edges whose (source type, target type) pair appears in
        ``settings.bad_types`` get a ``bad_combos`` attribute.
        """
        DG = nx.DiGraph()
        DG.add_node(0, type='0')  # this is the start
        for meter_full_description in settings.meters_with_feet:
            meter_type = settings.meters_with_feet[meter_full_description]
            # ignore feet denominator for now
            meter = meter_full_description.replace('/', '')
            curr_node = 0
            for c in meter:
                # Reuse an existing successor of the right type if any ...
                for n in DG.successors(curr_node):
                    if DG.node[n]['type'] == c:
                        curr_node = n
                        break
                else:
                    # ... otherwise grow the tree by one node.
                    new_node = len(DG.nodes())
                    DG.add_node(new_node, type=c)
                    DG.add_edge(curr_node, new_node)
                    # now add restraints to edge, this is a 'bad_combo'
                    # attribute
                    old_c = DG.node[curr_node]['type']
                    if (old_c, c) in settings.bad_types:
                        DG[curr_node][new_node]['bad_combos'] = (
                            settings.bad_types[(old_c, c)])
                    curr_node = new_node
            if meter:
                # store some metrical data at the last node
                DG.node[curr_node]['meter_type'] = meter_type
                DG.node[curr_node]['meter_full_description'] = (
                    meter_full_description)
        return DG

    def graph_scan(self, in_string):
        """Scan *in_string* against the meter graph.

        Returns a list of ``ScanResult``, one for every path that consumes
        all scan tokens and stops on a node carrying ``meter_type``.
        """
        completed_scans = []  # holds complete scans
        parse = self.pp.parse(in_string)  # holds output, matches
        # this generates scan_tokens from the scan of the input string,
        # e.g. ['b','c','v'], using the long parser (lp)
        scan_tokens = self.lp.tokenize(parse.output)
        graph = self.DG

        def descend_node(node_id, token_i, matches, matched_so_far):
            """Try every successor of *node_id* starting at token *token_i*."""
            for successor_id in graph.successors(node_id):
                successor = graph.node[successor_id]
                edge = graph[node_id][successor_id]
                node_type = successor['type']
                assert node_type in ('=', '-')
                # '=' nodes use the long parser, '-' nodes the short parser.
                parser = self.lp if node_type == '=' else self.sp
                for m in parser.match_all_at(scan_tokens, token_i):
                    # Reject (previous production, this production) pairs
                    # that the edge lists as bad combinations.
                    if matches and 'bad_combos' in edge:
                        if (matches[-1].found, m.production) in edge['bad_combos']:
                            continue  # abort! bad combination
                    # determine orig_tokens: what is matched from the original
                    # input (one entry per scan token, not flattened)
                    orig_tokens = [
                        parse.matches[i].tokens
                        for i in range(token_i, token_i + len(m.tokens))
                    ]
                    # NOTE(review): no ``ipa`` is passed here, unlike
                    # CustomMeterGraph.graph_scan -- confirm NodeMatch allows
                    # it to be omitted.
                    match_data = NodeMatch(
                        node_type=node_type, matched_tokens=m.tokens,
                        node_id=node_id, orig_tokens=orig_tokens,
                        found=m.production, token_i=token_i)
                    # advance token index based on length of match tokens
                    new_token_i = token_i + len(m.tokens)
                    so_far = matched_so_far + node_type
                    curr_matches = list(matches)
                    curr_matches.append(match_data)
                    if (new_token_i == len(scan_tokens)
                            and 'meter_type' in successor):
                        completed_scans.append(ScanResult(
                            scan=so_far, matches=curr_matches,
                            meter_type=successor['meter_type']))
                    else:
                        descend_node(successor_id, new_token_i, curr_matches,
                                     so_far)

        # start descent into node 0 of the graph, at token_i 0, with no matches
        descend_node(0, 0, [], '')
        return completed_scans

    def draw_graph(self):
        """Draw the graph on a new 15x15 figure, labelling nodes by ``type``."""
        g = self.DG
        pos = nx.spring_layout(g)
        plt.figure(figsize=(15, 15))
        # need to change labels for 0,1,etc.
        labels = dict((n, d['type']) for n, d in g.nodes(data=True))
        # Pass the layout computed above instead of letting nx.draw compute
        # a second one.
        nx.draw(g, pos=pos, labels=labels, node_color='#A0CBE2',
                node_size=200)