Example #1
import re
import operator
from collections import OrderedDict
from functools import reduce
import xml.etree.ElementTree as ET  # assumed; the original may use a project-local ElementTree module


def search_queries(I, queries, lines, output_file):

    # initialize the output doc creating the root
    attrs = OrderedDict()
    attrs['kwlist_filename'] = 'IARPA-babel202b-v1.0d_conv-dev.kwlist.xml'
    attrs['language'] = 'swahili'
    attrs['system_id'] = ''
    root = ET.Element('kwslist', attrs)

    # open query file and get all the hits (over all queries)
    doc = ET.parse(queries)
    kws = doc.getroot().findall('kw')

    # for each hit in the query file
    for kw in kws:
        # get id and text (split in words and save in a list q) of the query
        kwid = kw.get('kwid')
        q = re.split(r'\s+', kw.find('kwtext').text)
        # ensure all words in the query are lowercase
        q = [w.lower() for w in q]

        # if the first word is in the transcription, then search for the whole query
        if q[0] in I:
            root, detected_kwsl = kw_detected(root, kwid)
            # get info of current word
            qlen = len(q)

            # check all occurrences of the first word in the query
            for i in I[q[0]]:
                # check if query corresponds to current block in reference and time intervals are valid
                if match_query(lines, i, qlen, q) and valid_time_gap(
                        lines, i, qlen):
                    firstinfo = re.split(r'\s+', lines[i])
                    lastinfo = re.split(r'\s+', lines[i + qlen - 1])
                    durs = [
                        float(re.split(r'\s+', lines[x])[3])
                        for x in range(i, i + qlen)
                    ]
                    durtot = sum(durs)
                    scores = [
                        float(re.split(r'\s+', lines[x])[5])
                        for x in range(i, i + qlen)
                    ]
                    # multiply the score of the words in the query
                    finalscore = reduce(operator.mul, scores, 1)
                    info = OrderedDict()
                    info['file'] = firstinfo[0]
                    info['channel'] = firstinfo[1]
                    info['tbeg'] = firstinfo[2]
                    info['dur'] = str(round(durtot, 2))
                    info['score'] = str(finalscore)
                    info['decision'] = 'YES'
                    root, detected_kwsl = append_query_result(
                        root, detected_kwsl, info)

    outdoc = ET.ElementTree(root)
    return outdoc
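
The helpers kw_detected, match_query, valid_time_gap and append_query_result come from elsewhere in the project and are not shown. A minimal sketch of what they plausibly do, assuming each transcription line has the CTM-like form `file channel tbeg dur word score` (field positions inferred from the indices 3 and 5 used above) and a hypothetical 0.5 s limit on the gap between consecutive words:

def kw_detected(root, kwid):
    # open a detected_kwlist node for this keyword under the root
    detected_kwsl = ET.SubElement(root, 'detected_kwlist', OrderedDict([('kwid', kwid)]))
    return root, detected_kwsl


def append_query_result(root, detected_kwsl, info):
    # append one hit as an empty <kw> element carrying the attributes in info
    ET.SubElement(detected_kwsl, 'kw', info)
    return root, detected_kwsl


def match_query(lines, i, qlen, q):
    # the qlen lines starting at i must spell out the query q
    if i + qlen > len(lines):
        return False
    return all(re.split(r'\s+', lines[i + k])[4].lower() == q[k] for k in range(qlen))


def valid_time_gap(lines, i, qlen, max_gap=0.5):
    # consecutive words must be separated by at most max_gap seconds
    for k in range(qlen - 1):
        cur = re.split(r'\s+', lines[i + k])
        nxt = re.split(r'\s+', lines[i + k + 1])
        if float(nxt[2]) - (float(cur[2]) + float(cur[3])) > max_gap:
            return False
    return True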
Example #2
import sys
import xml.etree.ElementTree as ET  # assumed; the original may use a project-local ElementTree module
from collections import OrderedDict

if len(sys.argv) < 3:
	raise RuntimeError('Run script as:\n\tpython scoreNormalization.py path_to/input_file.xml path_to/output_file.xml [gamma]')

# gamma can be tuned
gamma = 1

input_file = sys.argv[1]
output_file = sys.argv[2]
if len(sys.argv)>3:
	gamma = float(sys.argv[3])


# sum over all hits of a query:
# open input file with original scores
doc = ET.parse(input_file)
detected_kwlists = doc.getroot().findall('detected_kwlist')
# for each query detected in the file
for dkw in detected_kwlists:
	# get all the hits and sum of all their scores
	kws = dkw.findall('kw')
	sum_scores = sum([pow(float(kw.get('score')),gamma) for kw in kws])
	# for each hit, update the score by dividing by the sum of the scores
	for kw in kws:
		new_score = str(pow(float(kw.attrib['score']), gamma) / sum_scores)
		att = OrderedDict()
		att['file'] = kw.attrib['file']
		att['channel'] = kw.attrib['channel']
		att['tbeg'] = kw.attrib['tbeg']
		att['dur'] = kw.attrib['dur']
		att['score'] = new_score
		att['decision'] = kw.attrib['decision']
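
The excerpt is cut off before att is used: this is sum-to-one score normalization, where each hit's score becomes score**gamma divided by the sum of score**gamma over all hits of the same keyword. A plausible continuation, assuming the new attributes are written back in place and the tree is then saved (hypothetical, not part of the original excerpt):

		# still inside the inner loop: overwrite the hit's attributes
		kw.attrib.clear()
		kw.attrib.update(att)

# after both loops: write the normalized scores out
doc.write(output_file)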
Example #3
            indent(elem, level + 1)
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i
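
The lines above are the tail of a pretty-printing helper that the excerpt cuts into mid-way; they match the widely used ElementTree indent recipe, whose complete form is:

def indent(elem, level=0):
    # add whitespace to the tree in place so the serialized XML is readable
    i = '\n' + level * '  '
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + '  '
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        for elem in elem:
            indent(elem, level + 1)
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i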


''' ----------------- MAIN ----------------- '''

file1 = sys.argv[1]
file2 = sys.argv[2]
output_file = sys.argv[3]

# load file for system 1
doc1 = ET.parse(file1)
detected_kwl_1 = doc1.getroot()

# load file for system 2
tree2 = ET.parse(file2)
detected_kwl_2 = tree2.getroot()

# get all the queries in system2
queries_2 = detected_kwl_2.findall('detected_kwlist')
kwids = [kw.get('kwid') for kw in queries_2]

# for each query in system1 find the one with same kwid in system2
for query_1 in detected_kwl_1:

    kwid = query_1.get("kwid")
    query_2 = []
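
The excerpt stops right after initializing query_2. Judging from the setup, the script merges the detected hit lists of two systems; a hypothetical continuation under that assumption:

    # find the detected_kwlist with the same kwid in system 2
    if kwid in kwids:
        query_2 = queries_2[kwids.index(kwid)]
        # append every hit of system 2 to the list of system 1
        for hit in query_2.findall('kw'):
            query_1.append(hit)

# pretty-print and write the merged list
indent(detected_kwl_1)
doc1.write(output_file)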
Example #4
import re
import sys
import xml.etree.ElementTree as ET  # assumed; the original may use a project-local ElementTree module

input_queries = sys.argv[1]
ref = sys.argv[2]
output_queries = sys.argv[3]
#print input_queries, ref, output_queries

# generate iv dictionary from the transcription ref
IV = iv_dict(ref)

# load the graphemic mapping and build the grapheme-confusion matrix CM
grph_map = 'lib/kws/grapheme.map'
with open(grph_map, 'r') as f:
    lines_map = f.readlines()
CM = generate_CM(lines_map)

# get all the hits of all the queries from the query file
doc = ET.parse(input_queries)
kws = doc.getroot().findall('kw')

# keep track of an OOV dictionary of the oov words you already encountered
# it will contain, for all the oov words, the closest iv word and the distance
OOV = {}

# for each query in the file
for kw in kws:
    kwtext = re.split(r'\s+', kw.find('kwtext').text)
    for i in range(len(kwtext)):
        w = kwtext[i]
        # check only the oov
        if w not in IV:
            # if w is already seen in this run we already have the info
            if w in OOV:
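
The excerpt is truncated here, and the helpers iv_dict and generate_CM are defined elsewhere in the project. Judging by how the dictionary is used (I[q[0]] in Example #1 yields line indices), a minimal sketch of iv_dict, assuming the reference transcription is a CTM file whose fifth field is the word (a hypothetical format):

def iv_dict(ref):
    # map each lowercase word in the reference to the indices of the lines containing it
    iv = {}
    with open(ref) as f:
        for idx, line in enumerate(f):
            fields = re.split(r'\s+', line.strip())
            if len(fields) > 4:
                iv.setdefault(fields[4].lower(), []).append(idx)
    return iv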
Example #5
import myetree.ElementTree as ET
import re
import sys

query_file = 'lib/kws/queries.xml'
outmap_file = 'querylength.map'

doc = ET.parse(query_file)
kws = doc.getroot().findall('kw')

# build a dictionary that stores the number (count) of queries of length n:
#    counter[n] = count
counter = {}

with open(outmap_file, 'w') as f:

    # for each query in the file
    for kw in kws:
        # get the query id: KW202-id
        idx = re.split('-', kw.get('kwid'))[-1]
        # load the list of words
        query = [x.lower() for x in re.split(r'\s+', kw.find('kwtext').text)]
        # evaluate the number of words
        n = len(query)
        if n not in counter:
            counter[n] = 0
        counter[n] += 1
        line = ' '.join([str(n), str(idx), str(counter[n])])
        f.write(line + '\n')

print('counter', counter)
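
Each output line has the form "<query length> <query index> <running count>". A minimal sketch of a consumer that reads the map back (hypothetical, not part of the original scripts):

# read querylength.map back into {query_index: query_length}
length_by_query = {}
with open(outmap_file) as f:
    for line in f:
        n, idx, cnt = line.split()
        length_by_query[idx] = int(n)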
Example #6
if error:
	raise RuntimeError('Run script as:\n\tpython morpoDecomposition.py path_to/input_file.{ctm, xml} path_to/dict_file.dct path_to/output_file.{ctm, xml}')


# read morphological file and build dictionary
with open(dct) as d:
	dct_lines = d.readlines()
D = make_dict(dct_lines)

extension = input_f.split('.')[-1]

if extension=='xml':
	# ---- file xml: it's the query.xml
	# open query file and get all hits for all queries
	doc = ET.parse(input_f)
	kws = doc.getroot().findall('kw')

	# for each hit in the file
	for kw in kws:
		kwtext = re.split(r'\s+', kw.find('kwtext').text)
		text = ''
		# split each word
		for w in kwtext:
			# get decomposition for w from morphological dictionary D
			decomposition = D[w]
			# accumulate the decomposed text for the tree node
			for s in decomposition:
				text += s+' '
		# remove the trailing space, if any
		if text[-1] == ' ':
			text = text[:-1]
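
make_dict is defined elsewhere; assuming each line of the .dct file maps a word to its space-separated morphological decomposition (a hypothetical format), a minimal sketch:

def make_dict(dct_lines):
    # map each word to the list of morphs it decomposes into
    D = {}
    for line in dct_lines:
        fields = line.split()
        if fields:
            D[fields[0]] = fields[1:]
    return D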