Пример #1
0
def worker_search_in_file(args):
    f, o = args
    tree = etree.parse(f)
    root = tree.getroot()
    speech_nodes = root.findall('.//speech')
    results = []
    for s in speech_nodes:
        speech_text = obctools.prepare_speech(s)
        a = {}
        a['Gender'] = s.get('sex', '')
        a['Trial'] = s.get('trial', '').replace("o", "t").replace("t-", "t")
        a['Year'] = s.get('year', '')
        if a['Year']=='':
            a['Year'] = a['Trial'][1:5]
        a['2-Periods'] = obctools.get_periods(a['Year'],2)
        a['3-Periods'] = obctools.get_periods(a['Year'],3)
        a['4-Periods'] = obctools.get_periods(a['Year'],4)
        a['5-Periods'] = obctools.get_periods(a['Year'],5)
        a['6-Periods'] = obctools.get_periods(a['Year'],6)
        a['Speaker role'] = s.get('role', '')
        a['HISCLASS'] = s.get('HISCLASS', '')
        a['2-Class'] = obctools.convert_hisclass_to_binclass(a['HISCLASS'])
        a['HISCO code'] = s.get('HISCO-code', '')
        a['HISCO label'] = s.get('HISCO-label', '')
        a['Speaker-ID'] = s.get('speaker', '').replace(
            " ", "").replace("-", "").strip()
        a['Printer'] = s.get('printer', '')
        a['Publisher'] = s.get('publisher', '')
        a['Scribe'] = s.get('scribe', '')
        a['Editor'] = s.get('editor', '')
        for m in o['regex'].finditer(speech_text):
            r = {}
            r.update(a)
            if o['show_pos'] == 1:
                r['Key'] = m.group(0)
                r['Left'] = speech_text[:m.start()][-o['context_left']:]
                r['Right'] = speech_text[m.end():][:o['context_right']]
            elif o['show_pos_key'] == 1:
                r['Key'] = m.group(0)
                r['Left'] = speech_text[:m.start()][-o['context_left']:]
                r['Right'] = speech_text[m.end():][:o['context_right']]
                r['Left'] = obctools.strip_pos_tags(r['Left'])
                r['Right'] = obctools.strip_pos_tags(r['Right'])
            else:
                r['Key'] = obctools.strip_pos_tags(m.group(0))
                r['Left'] = speech_text[:m.start()][-o['context_left']:]
                r['Right'] = speech_text[m.end():][:o['context_right']]
                r['Left'] = obctools.strip_pos_tags(r['Left'])
                r['Right'] = obctools.strip_pos_tags(r['Right'])
            r['Filename'] = os.path.basename(f)
            r['Search name'] = o['search_label']
            results.append(r)
    return results
Пример #2
0
    def finish_subcorpus(self):
        """Write the selected subcorpus to disk and report the outcome.

        Does nothing when ``self.subcorpus_results`` is empty.  Otherwise the
        selected speech nodes are written to
        ``<tool_path>/subcorpora/<self.filename>``, either as plain text
        (format ``".txt"``: one prepared speech per paragraph) or as an XML
        document with a ``<subcorpus>`` root carrying the counts as
        attributes.  The info label announces the file as saved only when
        it was actually written; otherwise the "selected" message stays.
        Finally the results are cleared and the UI is refreshed.
        """
        if self.subcorpus_results:
            pc = len(self.subcorpus_results)
            wc, uc = self.get_word_and_utterance_count(self.subcorpus_results)
            self.info_label["text"] = "{} utterances ({}) were selected" \
                                      " from {} proceedings.".format(uc, wc, pc)

            saved = False
            subcorpus_path = os.path.join(self.main.tool_path, "subcorpora")
            if obctools.make_dir(subcorpus_path):
                subcorpus_path = os.path.join(subcorpus_path, self.filename)
                if self.subcorpus_format.get() == ".txt":
                    # NOTE(review): the text file is opened with the platform
                    # default encoding, while the XML branch writes UTF-8.
                    with open(subcorpus_path, "w") as handler:
                        for r in self.subcorpus_results:
                            for sn in r['Nodes']:
                                handler.write(obctools.prepare_speech(sn))
                                handler.write("\n\n")
                    saved = True
                else:
                    # Build the root through the element API so that special
                    # characters in the filename (&, <, ") are escaped by the
                    # serializer instead of breaking hand-formatted markup.
                    root = etree.Element("subcorpus")
                    root.set("filename", self.filename)
                    root.set("utterances", str(uc))
                    root.set("words", str(wc))
                    root.set("proceedings", str(pc))
                    xml = etree.ElementTree(root)
                    for r in self.subcorpus_results:
                        for sn in r['Nodes']:
                            # Record which proceedings file the speech came from.
                            sn.set("filename", r['Filename'])
                            root.append(sn)
                    with open(subcorpus_path, "wb") as handler:
                        try:
                            xml.write(handler, xml_declaration=True, encoding="utf-8")
                            saved = True
                        except IOError as e:
                            print(e)

            if saved:
                # Only claim success when a file was really written; if the
                # directory could not be created or the write failed, the
                # "selected" message set above remains visible.
                self.info_label["text"] = "{:,} utterances ({:,} words) were selected" \
                                          " from {} proceedings" \
                                          " & saved as {}".format(uc, wc, pc,
                                                                  os.path.basename(self.filename))

            self.subcorpus_results = None
            self.main.root.update_idletasks()
Пример #3
0
def worker_select_from_file(args):
    """Pick the ``speech`` elements of one XML file that pass all filters.

    Args:
        args: a ``(path, filters)`` pair.  ``filters`` maps ``'Gender'``,
            ``'Speaker role'`` and ``'HISCLASS'`` to collections of accepted
            values; an empty collection disables that filter.

    Returns:
        A tuple ``(results, filename)``.  ``results`` is a one-element list
        holding ``{'Filename': <base name>, 'Nodes': [<matching elements>]}``
        and ``filename`` is that same base name.

    A missing or empty ``sex`` / ``role`` / ``HISCLASS`` attribute is treated
    as ``"u"``.  The role is lower-cased before it is compared.
    """
    path, filters = args
    document_root = etree.parse(path).getroot()
    selection = {'Filename': os.path.basename(path), 'Nodes': []}

    for speech in document_root.findall('.//speech'):
        # The prepared text is not used here; the call is kept because the
        # original performs it for every speech element.
        obctools.prepare_speech(speech)

        # Every active filter is evaluated (no short-circuiting between
        # filters); a disabled filter counts as passed.
        gender = speech.get('sex', 'u') or 'u'
        gender_ok = True
        if len(filters['Gender']) > 0:
            gender_ok = gender in filters['Gender']

        role_ok = True
        if len(filters['Speaker role']) > 0:
            role = speech.get('role', 'u').lower() or 'u'
            role_ok = role in filters['Speaker role']

        hisclass_ok = True
        if len(filters['HISCLASS']) > 0:
            hisclass = speech.get('HISCLASS', 'u') or 'u'
            hisclass_ok = hisclass in filters['HISCLASS']

        if gender_ok and role_ok and hisclass_ok:
            selection['Nodes'].append(speech)

    return [selection], selection['Filename']