Example #1
    def convert(self, id, result, format):
        assert format == "csv"

        _int = lambda x: None if x is None else int(x)
        naf = KafNafParser(BytesIO(result.encode("utf-8")))

        deps = {dep.get_to(): (dep.get_function(), dep.get_from())
                for dep in naf.get_dependencies()}
        tokendict = {token.get_id(): token for token in naf.get_tokens()}

        s = StringIO()
        w = csv.writer(s)
        w.writerow(["id", "token_id", "offset", "sentence", "para", "word", "term_id",
                    "lemma", "pos", "pos1", "parent", "relation"])
        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                tid = term.get_id()
                pos = term.get_pos()
                pos1 = POSMAP[pos]
                row = [id, token.get_id(), _int(token.get_offset()), _int(token.get_sent()),
                       _int(token.get_para()), token.get_text(),
                       tid, term.get_lemma(), pos, pos1]
                if tid in deps:
                    rel, parent = deps[tid]
                    row += [parent, rel.split("/")[-1]]
                else:
                    row += [None, None]
                w.writerow(row)
        return s.getvalue()
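POSMAP is not defined in this snippet; it comes from elsewhere in the project and presumably collapses the NAF pos tag into the coarse pos1 column. A hypothetical stand-in that would make the example self-contained (the fallback rule here is an assumption, not the project's actual mapping):

class _PosMap(dict):
    """Hypothetical POSMAP stand-in: fall back to the upper-cased first letter of the pos tag."""
    def __missing__(self, pos):
        return (pos or "?")[0].upper()

POSMAP = _PosMap()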
Example #2
File: tokens.py Project: aemal/amcat
    def from_naf(self, article, naf):
        def _int(x):
            return None if x is None else int(x)
        naf = KafNafParser(BytesIO(naf.encode("utf-8")))

        deps = {dep.get_to(): (dep.get_function(), dep.get_from())
                for dep in naf.get_dependencies()}
        tokendict = {token.get_id(): token for token in naf.get_tokens()}

        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                tid = term.get_id()
                tok = {"aid": article,
                       "token_id": token.get_id(),
                       "offset": _int(token.get_offset()),
                       "sentence": _int(token.get_sent()),
                       "para": _int(token.get_para()),
                       "word": token.get_text(),
                       "term_id": tid,
                       "lemma": term.get_lemma(),
                       "pos": term.get_pos()}
                if tid in deps:
                    rel, parent = deps[tid]
                    tok['parent'] = parent
                    tok['relation'] = rel.split("/")[-1]
                yield tok
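Both methods above rely on the same two joins: deps maps a dependent term id to its (relation, head term id) pair, and tokendict links each term back to its word forms via term.get_span().get_span_ids(). A minimal standalone sketch of that pattern, printing one line per term; the function name and the idea of passing the NAF document as a string are assumptions:

from io import BytesIO
from KafNafParserPy import KafNafParser

def print_dependencies(naf_string):
    # Parse a NAF document given as a unicode string, as in the examples above
    naf = KafNafParser(BytesIO(naf_string.encode("utf-8")))
    # dependent term id -> (dependency relation, head term id)
    deps = {dep.get_to(): (dep.get_function(), dep.get_from())
            for dep in naf.get_dependencies()}
    terms = {t.get_id(): t for t in naf.get_terms()}
    for tid, term in terms.items():
        rel, head_id = deps.get(tid, (None, None))
        head = terms[head_id].get_lemma() if head_id in terms else "-"
        print(tid, term.get_lemma(), rel, head)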
Example #3
def test_create_terms():
    """
    Can we create_terms via the create_{term,token} functions?
    """
    
    naf = KafNafParser(type="NAF")
    sent=1; offset=0
    input = [(u'dit', u'dit', u'O', u'VNW'),
             (u'is', u'zijn', u'V', u'WW'),
             (u'een', u'een', u'D', u'LID'),
             (u'test', u'test', u'N', u'N')]

    offset = 0
    for (word, lemma, pos, morph) in input:
        token = naf.create_wf(word, 1, offset)
        offset += len(word)
        term = naf.create_term(lemma, pos, morph, [token])

    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)
    
    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    result = [result[tid] for tid in sorted(result.keys())]
    assert_equal(input, result)
Example #4
    def process_single_file(self,file):
        try:
            xml_obj = KafNafParser(file)
        except:
            print>>sys.stderr,'Error parsing',file,': skipped'
            return        


        print>>sys.stderr,'Processing file', os.path.basename(file), 'Type:',xml_obj.get_type()
        self.langs[xml_obj.get_language()] += 1
        sentences = []
        current_sent = []
        this_sent = None

        pos_for_wid = {}  ## For each token id (wid), its pos
        for term in xml_obj.get_terms():
            w_ids = term.get_span().get_span_ids()
            pos = term.get_pos()
            for wid in w_ids:
                pos_for_wid[wid] = pos

        for token in xml_obj.get_tokens():
            wid = token.get_id()
            value = token.get_text()
            if self.convert_to_lowercase:
                value = value.lower()
                
            if value in self.punctuation:
                value = 'PUN'
                
            if value == '*':
                value = 'STAR'
            
            sentence = token.get_sent()
            if this_sent is not None and sentence != this_sent:  ## There is a new sent
                sentences.append(current_sent)
                current_sent = []
            current_sent.append((wid,value))
            this_sent = sentence
        ## Add the last sentence as well
        sentences.append(current_sent)
        
        for sentence in sentences:
            if self.include_sentence_delimiters:
                sentence.insert(0,('xxx','<S>'))
                sentence.append(('xxx','</S>'))
        
            for idx in range(0,len(sentence)):
                for ngramlen in range(self.min_ngram_len,self.max_ngram_len+1):
                    file_desc = self.get_file_desc_for_ngram(ngramlen)
                    start = idx
                    end = start + ngramlen
                    if end <= len(sentence):
                        this_ngram = '\t'.join(value for wid, value in sentence[start:end])
                        this_ngram_pos = '\t'.join(pos_for_wid.get(wid,'X') for wid, value in sentence[start:end])
                        file_desc.write(this_ngram.encode('utf-8')+'\t'+DELIMITER+'\t'+this_ngram_pos+'\n')
Example #5
def test_corenlp_naf():
    _check_corenlp()
    naf_bytes = corenlp.corenlp_naf("John shoots himself", annotators=corenlp.LEMMATIZER)
    print naf_bytes
    naf = KafNafParser(BytesIO(naf_bytes))

    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()), {"John", "shoot", "himself"})
Example #6
File: test_frog.py Project: amcat/nlpipe
def test_frog_saf():
    _check_frog()
    naf_str = frog._process("Mark Rutte werkte gisteren nog bij de  Vrije Universiteit in Amsterdam")

    naf = KafNafParser(BytesIO(naf_str))
    lemmata = {t.get_lemma() for t in naf.get_terms()}
    assert_equal(lemmata, {"Mark_Rutte", "werken", "gisteren", "nog", "bij",
                           "de", "vrij", "universiteit", "in", "Amsterdam"})
Example #7
def create_naf(text):
    naf = KafNafParser(type="NAF")
    naf.set_version("3.0")
    naf.set_language("nl")
    naf.lang = "nl"
    naf.raw = text
    naf.set_raw(naf.raw)
    return naf
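create_naf only prepares an empty document. A minimal sketch of filling and serializing such an object, reusing the create_wf/create_term calls from Example #3; the whitespace tokenization and the placeholder pos/morphofeat values are assumptions:

def create_naf_with_tokens(text):
    naf = create_naf(text)
    offset = 0
    for word in text.split():
        offset = text.index(word, offset)       # character offset into the raw text
        token = naf.create_wf(word, 1, offset)  # put everything in sentence 1
        naf.create_term(word.lower(), "O", "X", [token])  # placeholder lemma/pos/morphofeat
        offset += len(word)
    return naf

# naf = create_naf_with_tokens("dit is een test")
# naf.dump()  # writes the NAF XML to stdout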
Example #8
def test_corenlp_naf():
    _check_corenlp()
    naf_bytes = corenlp.corenlp_naf("John shoots himself",
                                    annotators=corenlp.LEMMATIZER)
    print naf_bytes
    naf = KafNafParser(BytesIO(naf_bytes))

    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()), {"John", "shoot", "himself"})
Example #9
def test_dump():
    """
    Can we use naf.dump() to stdout and file?

    Make sure to run with nosetests -s, otherwise python3 will err
    """

    naf = KafNafParser(type="NAF")
    token = naf.create_wf("\xd8lleg\xe5rd", 1, 1)
    expected = '<![CDATA[\xd8lleg\xe5rd]]></wf>'

    # do we get an error on dumping to stdout without redirect?
    naf.dump()

    # Can we dump to stdout?
    with capture_stdout() as s:
        naf.dump()
    output = s.getvalue().decode("utf-8")
    assert_in(expected, output)

    # Can we dump to a named file?
    f = tempfile.NamedTemporaryFile(suffix=".xml", delete=False)
    try:
        naf.dump(f.name)
        f.close()
        output = open(f.name, mode='rb').read().decode('utf-8')
    finally:
        os.remove(f.name)
    assert_in(expected, output)
Example #10
def get_sentence(naf: KafNafParser, term: Cterm) -> int:
    tokens = [
        naf.get_token(tid)
        for tid in naf.get_dict_tokens_for_termid(term.get_id())
    ]
    sent = {t.get_sent() for t in tokens}
    if len(sent) != 1:
        raise Exception(
            f"Term {term.get_id}:{term.get_lemma()} did not map to single sentence: {sent}"
        )
    return sent.pop()
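A brief usage sketch for the helper above, grouping term ids by the sentence they belong to (the input file name is hypothetical):

from KafNafParserPy import KafNafParser

naf = KafNafParser("document.naf")  # hypothetical input file
terms_per_sentence = {}
for term in naf.get_terms():
    terms_per_sentence.setdefault(get_sentence(naf, term), []).append(term.get_id())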
Example #11
File: test_frog.py Project: amcat/nlpipe
def test_frog_saf():
    _check_frog()
    naf_str = frog._process(
        "Mark Rutte werkte gisteren nog bij de  Vrije Universiteit in Amsterdam"
    )

    naf = KafNafParser(BytesIO(naf_str))
    lemmata = {t.get_lemma() for t in naf.get_terms()}
    assert_equal(
        lemmata, {
            "Mark_Rutte", "werken", "gisteren", "nog", "bij", "de", "vrij",
            "universiteit", "in", "Amsterdam"
        })
Example #12
def main(argv):
  conversion = ""
  try:
    opts, args = getopt.getopt(argv,"hkn",["tokaf","tonaf"])
  except getopt.GetoptError:
    print 'could not parse options. Correct usage: \n\n kaf-naf-parser.py --tokaf --tonaf'
    sys.exit(2)
  for opt, arg in opts:
    if opt == '-h':
      print 'test.py --tokaf --tonaf'
      sys.exit()
    elif opt in ("-k", "--tokaf"):
      conversion = "to-kaf"
    elif opt in ("-n", "--tonaf"):
      conversion = "to-naf"

  if conversion == "":
    conversion = "kaf-naf"

  obj = KafNafParser(sys.stdin)

  if conversion == "to-kaf":
    obj.to_kaf()
  if conversion == "to-naf":
    obj.to_naf()

  obj.dump()
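Hypothetical invocation, matching the usage string in the code: cat doc.kaf | python kaf-naf-parser.py --tonaf > doc.naf converts a KAF document read from stdin to NAF on stdout; with no flag the document is parsed and re-dumped unchanged.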
Example #13
def test_dump():
    """
    Can we use naf.dump() to stdout and file?

    Make sure to run with nosetests -s, otherwise python3 will err
    """

    naf = KafNafParser(type="NAF")
    token = naf.create_wf("\xd8lleg\xe5rd", 1, 1)
    expected = '<![CDATA[\xd8lleg\xe5rd]]></wf>'

    # do we get an error on dumping to stdout without redirect?
    naf.dump()
    
    # Can we dump to stdout?
    with capture_stdout() as s:
        naf.dump()
    output = s.getvalue().decode("utf-8")
    assert_in(expected, output)
    
    # Can we dump to a named file?
    f = tempfile.NamedTemporaryFile(suffix=".xml", delete=False)
    try:
        naf.dump(f.name)
        f.close()
        output = open(f.name, mode='rb').read().decode('utf-8')
    finally:
        os.remove(f.name)
    assert_in(expected, output)
Example #14
def run_and_compare(in_filename, out_filename, correct_out_filename,
                    use_subprocess=True, **kwargs):
    """
    Runs the system with `in_filename` as input and `out_filename` as output
    and then compares the result to `correct_out_filename`.

    Because some header data changes (as it should), the contents of
    `correct_out_filename` will be formatted using a call to `str.format` with
    the following keyword arguments:

        - version
        - timestamp
        - beginTimestamp
        - endTimestamp
        - hostname

    """
    with open(in_filename) as fd, open(out_filename, 'wb') as out:
        if use_subprocess:
            run_with_subprocess(fd, out, **kwargs)
        else:
            run_without_subprocess(fd, out, **kwargs)

    with open(out_filename) as out, open(correct_out_filename) as correct:
        # Check something happened and that the result can be parsed
        outnaf = KafNafParser(out_filename)

        # Get the header information to be able to compare raw files
        our_header_layer = list(
            outnaf.get_linguisticProcessors()
        )[-1]
        assert our_header_layer.get_layer() == 'coreferences'

        processors = list(
            our_header_layer.get_linguistic_processors()
        )
        assert len(processors) == 1

        our_header_data = processors[0]

        correct = correct.read().format(
            version=our_header_data.get_version(),
            timestamp=our_header_data.get_timestamp(),
            beginTimestamp=our_header_data.get_beginTimestamp(),
            endTimestamp=our_header_data.get_endTimestamp(),
            hostname=our_header_data.get_hostname(),
        )
        assert correct == out.read()
Example #15
 def alpino(cls, data: bytes) -> bytes:
     data = BytesIO(data)
     try:
         data = KafNafParser(data)
     except XMLSyntaxError:
         pass  # alpino can parse raw text
     return dump_naf(alpinonaf.parse(data))
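dump_naf and alpinonaf are external to this snippet. A sketch of what dump_naf might look like, assuming it mirrors the BytesIO-based dump used in Examples #24 and #28; the real helper may differ:

from io import BytesIO

def dump_naf(naf):
    # Serialize a KafNafParser object back to NAF bytes (assumed shape of the helper)
    out = BytesIO()
    naf.dump(out)
    return out.getvalue()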
Example #16
File: nlpipe.py Project: BBie/amcat
    def from_naf(self, article, naf):
        naf = KafNafParser(BytesIO(naf.encode("utf-8")))
        tokendict = {token.get_id(): token for token in naf.get_tokens()}

        for term in naf.get_terms():
            tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
            for token in tokens:
                yield {"aid": article.pk,
                       "token_id": token.get_id(),
                       "offset": token.get_offset(),
                       "sentence": token.get_sent(),
                       "para": token.get_para(),
                       "word": token.get_text(),
                       "term_id": term.get_id(),
                       "lemma": term.get_lemma(),
                       "pos": term.get_pos()}
Example #17
def read_training_data(file_name):
    """
    read KAF/NAF and match the aspects with the words
    """
    parser = KafNafParser(PATH_ANNOTATED_DATA + file_name)
    terms = list(parser.get_terms())
    #    create token dictionary containing naf info
    tokens_container = dict()
    for token_el in parser.get_tokens():
        token_node = token_el.node
        token_id = token_node.get('wid').replace('w', 't')
        token_info = token_node.attrib
        tokens_container[token_id] = token_info
    properties = list(parser.get_properties())
    handled_properties, term_dict = handle_properties(properties, terms,
                                                      tokens_container)
    return terms, properties, handled_properties, term_dict, tokens_container
Example #18
 def convert(self, id, result, format):
     assert format == "csv"
     naf = KafNafParser(BytesIO(result.encode("utf-8")))
     memo = self._csv_memo(naf)
     tokendict = {token.get_id(): token for token in naf.get_tokens()}
     s = StringIO()
     w = csv.writer(s)
     w.writerow(self._csv_header())
     for term in naf.get_terms():
         tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
         for token in tokens:
             tid = term.get_id()
             pos = term.get_pos()
             pos1 = POSMAP[pos]
             row = [id] + list(self._csv_row(memo, term, token))
             w.writerow(row)
     return s.getvalue()
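The _csv_memo, _csv_header and _csv_row helpers are not shown. Judging from Example #1, of which this method looks like a refactoring, they could be methods of the same converter class roughly like the following; this is a reconstruction, not the project's actual code:

    def _csv_header(self):
        return ["id", "token_id", "offset", "sentence", "para", "word",
                "term_id", "lemma", "pos", "pos1", "parent", "relation"]

    def _csv_memo(self, naf):
        # dependent term id -> (relation, head term id), as in Example #1
        return {dep.get_to(): (dep.get_function(), dep.get_from())
                for dep in naf.get_dependencies()}

    def _csv_row(self, memo, term, token):
        _int = lambda x: None if x is None else int(x)
        tid = term.get_id()
        pos = term.get_pos()
        rel, parent = memo.get(tid, (None, None))
        return [token.get_id(), _int(token.get_offset()), _int(token.get_sent()),
                _int(token.get_para()), token.get_text(),
                tid, term.get_lemma(), pos, POSMAP[pos],
                parent, rel.split("/")[-1] if rel else None]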
Example #19
def map_opinion_labels(input_file, output_file, config_file):
    # Load the mapping from the config_file
    mapping = {}
    parser = ConfigParser.ConfigParser()
    parser.read(config_file)
    for mapped_opinion, values_in_corpus in parser.items('valid_opinions'):
        values = [v for v in values_in_corpus.split(';') if v != '']
        for v in values:
            mapping[v] = mapped_opinion
    del parser
    ##################

    input_kaf = KafNafParser(input_file)
    remove_these = []
    for opinion in input_kaf.get_opinions():
        exp = opinion.get_expression()
        polarity = exp.get_polarity()
        if polarity in mapping:
            mapped_polarity = mapping[polarity]
        else:
            opi_id = opinion.get_id()
            remove_these.append(opi_id)
            mapped_polarity = polarity

        exp.set_polarity(mapped_polarity)

    for opi_id in remove_these:
        input_kaf.remove_this_opinion(opi_id)
    input_kaf.dump(output_file)
Example #20
def map_opinion_labels(input_file,output_file,config_file):
    # Load the mapping from the config_file
    mapping = {}
    parser = ConfigParser.ConfigParser()
    parser.read(config_file)
    for mapped_opinion, values_in_corpus in parser.items('valid_opinions'):
        values = [ v for v in values_in_corpus.split(';') if v != '']
        for v in values:
            mapping[v] = mapped_opinion
    del parser
    ##################        
    
    input_kaf = KafNafParser(input_file)
    remove_these = []
    for opinion in input_kaf.get_opinions():
        exp = opinion.get_expression()
        polarity = exp.get_polarity()
        if polarity in mapping:
            mapped_polarity = mapping[polarity]
        else:
            opi_id = opinion.get_id()
            remove_these.append(opi_id)
            mapped_polarity = polarity
            
        exp.set_polarity(mapped_polarity)
        
    for opi_id in remove_these:
        input_kaf.remove_this_opinion(opi_id)
    input_kaf.dump(output_file)
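Both map_opinion_labels variants above read the label mapping from an INI-style file through ConfigParser. A hypothetical end-to-end usage; the section name and the ';'-separated value lists follow the parsing code, while the concrete polarity labels are borrowed from other examples on this page:

config_text = """[valid_opinions]
positive = Positive;StrongPositive
negative = Negative;StrongNegative
"""
with open("mapping.cfg", "w") as fd:
    fd.write(config_text)

# Opinions whose polarity is not listed in the mapping are removed from the output document.
map_opinion_labels("input.kaf", "output.kaf", "mapping.cfg")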
Example #21
def test_create_terms():
    """
    Can we create_terms via the create_{term,token} functions?
    """

    naf = KafNafParser(type="NAF")
    sent = 1
    offset = 0
    input = [(u'dit', u'dit', u'O', u'VNW'), (u'is', u'zijn', u'V', u'WW'),
             (u'een', u'een', u'D', u'LID'), (u'test', u'test', u'N', u'N')]

    offset = 0
    for (word, lemma, pos, morph) in input:
        token = naf.create_wf(word, 1, offset)
        offset += len(word)
        term = naf.create_term(lemma, pos, morph, [token])

    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)

    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    result = [result[tid] for tid in sorted(result.keys())]
    assert_equal(input, result)
Example #22
def extract_data_file(filename, label_gold, label_system, this_temp_folder=None, get_random=False):
    if this_temp_folder is None:
        temp_folder = mkdtemp()
    else:
        temp_folder = this_temp_folder
        
    fd_gold = open(temp_folder+'/'+__gold_filename__,'a')
    fd_system = open(temp_folder+'/'+__system_filename__, 'a')
    
    input_obj = KafNafParser(filename)
    for term in input_obj.get_terms():
        #Get gold
        term_id = term.get_id()
        results_gold = []
        results_system = []
        for ext_ref in term.get_external_references():
            resource = ext_ref.get_resource()
            if resource == label_gold:
                results_gold.append((ext_ref.get_reference(),ext_ref.get_confidence()))
            elif resource == label_system:
                results_system.append((ext_ref.get_reference(),ext_ref.get_confidence()))
        
        if len(results_gold) > 0:
            best_gold_label, best_gold_value = get_max_from_list(results_gold)
            fd_gold.write(filename+'\t'+term_id+'\t'+best_gold_label+'\n')
            
            if get_random:
                best_system_label, best_system_value = get_random_from_list(results_system)
            else:
                best_system_label, best_system_value = get_max_from_list(results_system)
                
            if best_system_label is not None:
                fd_system.write(filename+'\t'+term_id+'\t'+best_system_label+'\n')
    fd_gold.close()
    fd_system.close()
    
    #Create the "fake" sense.mappings
    fd_map = open(temp_folder+'/'+__sense_mapping__,'w')
    fd_map.close()
    return temp_folder
Example #23
def test_corenlp2naf():
    xml = open(os.path.join(os.path.dirname(__file__), "test_corenlp.xml")).read()
    naf_bytes = corenlp.corenlp2naf(xml, corenlp.PARSER)
    naf = KafNafParser(BytesIO(naf_bytes))
    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()), {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in naf.get_terms() if t.get_lemma() == 'London'][0]
    assert_equal(london.get_pos(), 'R')
    assert_equal(london.get_morphofeat(), 'NNP')

    ents = {}
    for e in naf.get_entities():
        for ref in e.get_references():
            for term_id in ref.get_span().get_span_ids():
                ents[terms[term_id]] = e.get_type()
    assert_equal(ents, {"John": "PERSON", "London": "LOCATION"})

    deps = {terms[d.get_from()]: (d.get_function(), terms[d.get_to()])
            for d in naf.get_dependencies()}
    expected = {'I': ('nsubj', 'hit'),
                'John': ('nsubj', 'attack'),
                'London': ('prep_in', 'attack'),
                'back': ('advmod', 'hit'),
                'he': ('dobj', 'hit')}
    assert_equal(deps, expected)

    corefs = []
    for coref in naf.get_corefs():
        corefs.append(set())
        for span in coref.get_spans():
            corefs[-1] |= {terms[t] for t in span.get_span_ids()}
    assert_in({"John", "he"}, corefs)
Example #24
def test_header():
    """
    Do the functions to set header attributes work correctly?

    Make sure to run with nosetests -s, otherwise python3 will err
    """

    naf = KafNafParser(type="NAF")
    naf.header = CHeader(type=naf.type)
    naf.root.insert(0, naf.header.get_node())

    naf.header.set_uri("http://example.com")
    assert_equal("http://example.com", naf.header.get_uri())
    naf.header.set_publicId("123")
    assert_equal("123", naf.header.get_publicId())

    # test if properties are serialized/deserialized correctly
    b = BytesIO()
    naf.dump(b)
    b.seek(0)
    naf2 = KafNafParser(b, type="NAF")
    assert_equal("http://example.com", naf2.header.get_uri())
    assert_equal("123", naf2.header.get_publicId())
Example #25
    def single_main(
        cls,
        output_file,
        naf_file,
        naf_extension=c.NAF_EXTENSION,
        validate=c.VALIDATE,
        uniqueyfy=c.UNIQUEYFY,
        fill_non_consecutive_coref_spans=c.FILL_NON_CONSECUTIVE_COREF_SPANS,
        sentence_filter=c.SENTENCE_DEFAULT_FILTER,
        conll_columns=c.CONLL_COLUMNS,
        conll_defaults=c.CONLL_DEFAULTS,
        min_column_spacing=c.MIN_COLUMN_SPACING,
        on_missing=c.CONLL_ON_MISSING,
    ):
        # Read document ID
        document_id = document_ID_from_filename(naf_file, naf_extension)
        cls.check_document_id(document_id, naf_file, on_missing['document_id'])

        # Read data
        reader = NAFReader(validate=validate)
        nafobj = KafNafParser(naf_file)
        sentences = reader.extract_sentences(nafobj)
        coref_sets = reader.extract_coref_sets(nafobj)
        del reader, nafobj

        add_word_numbers(sentences)

        CorefConverter(
            sentences,
            uniqueyfy=uniqueyfy,
            fill_spans=fill_non_consecutive_coref_spans,
        ).add_data_from_coref_sets(coref_sets)
        del coref_sets

        sentences = filter(sentence_filter, sentences)

        # Save the data to CoNLL
        cls.write_conll(filename=output_file,
                        writer=CoNLLWriter(
                            defaults=conll_defaults,
                            min_column_spacing=min_column_spacing,
                            on_missing=on_missing,
                            columns=conll_columns),
                        document_id=document_id,
                        sentences=sentences)
Example #26
def test_corenlp2naf():
    xml = open(os.path.join(os.path.dirname(__file__),
                            "test_corenlp.xml")).read()
    naf_bytes = corenlp.corenlp2naf(xml, corenlp.PARSER)
    naf = KafNafParser(BytesIO(naf_bytes))
    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(
        set(terms.values()),
        {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in naf.get_terms() if t.get_lemma() == 'London'][0]
    assert_equal(london.get_pos(), 'R')
    assert_equal(london.get_morphofeat(), 'NNP')

    ents = {}
    for e in naf.get_entities():
        for ref in e.get_references():
            for term_id in ref.get_span().get_span_ids():
                ents[terms[term_id]] = e.get_type()
    assert_equal(ents, {"John": "PERSON", "London": "LOCATION"})

    deps = {
        terms[d.get_from()]: (d.get_function(), terms[d.get_to()])
        for d in naf.get_dependencies()
    }
    expected = {
        'I': ('nsubj', 'hit'),
        'John': ('nsubj', 'attack'),
        'London': ('prep_in', 'attack'),
        'back': ('advmod', 'hit'),
        'he': ('dobj', 'hit')
    }
    assert_equal(deps, expected)

    corefs = []
    for coref in naf.get_corefs():
        corefs.append(set())
        for span in coref.get_spans():
            corefs[-1] |= {terms[t] for t in span.get_span_ids()}
    assert_in({"John", "he"}, corefs)
Example #27
def _test_file(this_file):
    input_fd = open(this_file)
    
    result = subprocess.check_output(os.path.join(__here__,'run_parser.sh'), stdin=input_fd)
    my_obj = KafNafParser(BytesIO(result))

    #Check the terms
    terms = [term for term in my_obj.get_terms()]
    assert_equal(len(terms),12)
    assert_equal(my_obj.get_term('t_4').get_lemma(),'mooi')
    assert_equal(my_obj.get_term('t_4').get_pos(),'adj')

    #Check constituents
    trees = [tree for tree in my_obj.get_trees()]
    assert_equal(len(trees),2)
    assert_equal(trees[0].get_terminals_as_list()[1].get_span().get_span_ids(),['t_1'])
    
    #Check dependencies
    dependencies = [dep for dep in my_obj.get_dependencies()]
    assert_equal(len(dependencies),10)
    assert_equal(dependencies[5].get_function(),'hd/su')
Example #28
def test_header():
    """
    Do the functions to set header attributes work correctly?

    Make sure to run with nosetests -s, otherwise python3 will err
    """

    naf = KafNafParser(type="NAF")
    naf.header = CHeader(type=naf.type)
    naf.root.insert(0, naf.header.get_node())

    naf.header.set_uri("http://example.com")
    assert_equal("http://example.com", naf.header.get_uri())
    naf.header.set_publicId("123")
    assert_equal("123", naf.header.get_publicId())

    # test if properties are serialized/deserialized correctly
    b = BytesIO()
    naf.dump(b)
    b.seek(0)
    naf2 = KafNafParser(b, type="NAF")
    assert_equal("http://example.com", naf2.header.get_uri())
    assert_equal("123", naf2.header.get_publicId())
Example #29
from KafNafParserPy import KafNafParser

if __name__ == '__main__':

    files = []
    fd = open('nl.list.test')
    for line in fd:
        files.append(line.strip())
    fd.close()

    my_polarity_classifier = PolarityClassifier('nl')
    my_polarity_classifier.load_models(sys.argv[1])

    OK = WR = 1
    for example_file in files:
        this_obj = KafNafParser(example_file)

        my_polarity_classifier.classify_kaf_naf_object(this_obj)
        this_obj.dump()

        break

        GOLD = {}
        list_ids_term_ids = []
        for opinion in this_obj.get_opinions():
            op_exp = opinion.get_expression()
            polarity = op_exp.get_polarity()
            term_ids = op_exp.get_span().get_span_ids()
            list_ids_term_ids.append((opinion.get_id(), term_ids))
            GOLD[opinion.get_id()] = polarity
Example #30
def get_terms_in_sentence(naf: KafNafParser, sent: int) -> Iterable[Cterm]:
    tokens = sort_tokens(t for t in naf.get_tokens() if t.get_sent() == sent)
    tokenids = [t.get_id() for t in tokens]
    return sort_terms(
        naf, [naf.get_term(tid) for tid in naf.map_tokens_to_terms(tokenids)])
Example #31
def find_terms(naf: KafNafParser, words: Sequence[str]) -> Iterable[Cterm]:
    """Find all terms whose lemma or word form is in the list of words"""
    for t in naf.get_terms():
        if t.get_lemma() in words or get_word(naf, t) in words:
            yield t
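get_word (used by find_terms) and the sort_tokens/sort_terms helpers from Example #30 are defined elsewhere. A sketch of what get_word could look like, reusing the token lookups from Example #10; the actual implementation may differ:

def get_word(naf: KafNafParser, term: Cterm) -> str:
    """Surface form of a term: the text of its underlying tokens, in offset order."""
    tokens = [naf.get_token(tid)
              for tid in naf.get_dict_tokens_for_termid(term.get_id())]
    tokens.sort(key=lambda t: int(t.get_offset()))
    return " ".join(t.get_text() for t in tokens)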
Example #32
def add_file(filename, data_lexelt, reftype='lexical_key'):
    obj = KafNafParser(filename)
    tokens_per_sent = {}
    sent_for_token = {}
    sents_in_order = []
    for token in obj.get_tokens():
        sentid = token.get_sent()
        if sentid not in sents_in_order:
            sents_in_order.append(sentid)
        sent_for_token[token.get_id()] = sentid
        if sentid not in tokens_per_sent: tokens_per_sent[sentid] = []
        tokens_per_sent[sentid].append((token.get_id(), token.get_text()))

    annotated_lemmas = []  # LIST of (full_id, token ids, lemma,pos,synset)
    for term in obj.get_terms():
        synset_label = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'lexical_key':
                synset_label = term.get_lemma() + '%' + ext_ref.get_reference()
            elif ext_ref.get_reftype() == 'sense' and ext_ref.get_resource(
            ) == 'WordNet-3.0':
                synset_label = ext_ref.get_reference()
            if synset_label is not None:
                break

        if synset_label is not None:
            annotated_lemmas.append(
                (filename + '#' + term.get_id(),
                 term.get_span().get_span_ids(), term.get_lemma(),
                 term.get_pos(), synset_label))

    for full_id, token_ids, lemma, pos, synset_label in annotated_lemmas:
        #CREATE NEW INSTANCE

        this_key = lemma + '.' + pos.lower()[0]
        if this_key not in data_lexelt:
            data_lexelt[this_key] = Clexelt(this_key, pos)

        if not data_lexelt[this_key].exists(full_id):
            #Create the new instance
            new_instance = Cinstance()
            new_instance.id = full_id
            new_instance.docsrc = filename
            new_instance.key = synset_label

            tokens = []
            target_indexes = []
            this_sent = sent_for_token[token_ids[0]]
            index = sents_in_order.index(this_sent)
            start_idx = max(index - 2, 0)
            end_idx = min(index + 2, len(sents_in_order) - 1)
            selected_sents = sents_in_order[start_idx:end_idx + 1]
            num_token = 0
            for current_sent in selected_sents:
                for token_id, token_text in tokens_per_sent[str(current_sent)]:
                    tokens.append(token_text)
                    if token_id in token_ids:
                        target_indexes.append(num_token)
                    num_token += 1

            new_instance.tokens = tokens[:]
            new_instance.index_head = target_indexes[:]
            data_lexelt[this_key].add_instance(new_instance)
Example #33
def main(inputfile,
         this_type,
         folder,
         overall_parameters={},
         detected_dse={},
         log=False):
    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(folder + '/' + TRAINING_FILENAME, 'w')

        ##Save the parameters
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'w')
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print >> sys.stderr, 'Parameters saved to file %s' % parameter_filename
        fd_parameter.close()

        #Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()

    elif this_type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'r')
        overall_parameters = pickler.load(fd_param)
        fd_param.close()

        #Input is a single file
        files.append(inputfile)

        #Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'r')
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val

        #Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder + '/' + TESTING_FILENAME, 'w')

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print >> sys.stderr, 'HOLDER: processing file', filename

        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)

        create_structures(naf_obj, filename)

        #Extract all the opinions
        opinions_per_sentence = defaultdict(list)

        num_opinions = 0

        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    #if p.startswith('D-'):
                    holder = opinion.get_holder()
                    if holder is not None:
                        span = holder.get_span()
                        if span is not None:
                            span_ids = span.get_span_ids()
                            if len(span_ids) != 0:
                                sentence_id = get_sentence_id_for_opinion(
                                    naf_obj, opinion)
                                if sentence_id is not None:
                                    opinions_per_sentence[sentence_id].append(
                                        opinion)
                                    num_opinions += 1

        if log:
            print >> sys.stderr, '\tNum of opinions:', num_opinions

        if this_type == 'train':
            # For training, a sequence is created for every opinion:
            # one per DSE, so sentences may be repeated
            sentences_with_opinions = set()
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    sentences_with_opinions.add(this_sentence)
                    create_sequence(naf_obj,
                                    this_type,
                                    this_sentence,
                                    overall_parameters,
                                    opinion,
                                    output=output_fd)

            #Include the rest of the sentences without opinions
            '''
            for sentence_id in naf_obj.list_sentence_ids:
                if sentence_id not in sentences_with_opinions:
                    create_sequence(naf_obj, sentence_id, overall_parameters, list_opinions=[])
            '''

        elif this_type == 'tag':
            # Obtain the opinions per sentence
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#') + 1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                sentence_for_opinion = first_token.get_sent()
                opinions_per_sentence[sentence_for_opinion].append(list_ids)

            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj,
                                    this_type,
                                    this_sentence,
                                    overall_parameters,
                                    opinion=list_dse_token_ids,
                                    output=output_fd,
                                    log=log)

        elif this_type == 'test':
            opinion_list = []
            '''
            for sentence_id in naf_obj.list_sentence_ids:
                if sentence_id in opinions_per_sentence:
                    for this_sentence, these_opinions in opinions_per_sentence.items():
                        for opinion in these_opinions:
                            create_sequence(naf_obj, this_type, this_sentence, overall_parameters,opinion, output = output_fd)
                            opinion_list.append(opinion)
                else:
                    create_sequence(naf_obj, this_type, sentence_id, overall_parameters,opinion=None, output = output_fd)
               
            '''
            #For the testing, one sequence is created for every sentence, with no opinion included
            opinion_list = []
            #We include only the sentences where there are opinions
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj,
                                    this_type,
                                    this_sentence,
                                    overall_parameters,
                                    opinion,
                                    output=output_fd)
                    opinion_list.append(opinion)

            ## Create the gold standard data also
            if gold_fd is not None:
                create_gold_standard_holder(naf_obj, opinion_list, gold_fd)

    if gold_fd is not None:
        gold_fd.close()
        print >> sys.stderr, 'Gold standard in the file %s' % gold_fd.name

    return output_fd.name
Example #34
def main(inputfile, this_type, folder, overall_parameters = {}, detected_dse = {},log=False):
    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(folder+'/'+TRAINING_FILENAME,'w')
            
        ##Save the parameters
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename,'w')
        pickler.dump(overall_parameters,fd_parameter,protocol=0)
        print>>sys.stderr,'Parameters saved to file %s' % parameter_filename
        fd_parameter.close()
        
        #Input is a file with a list of files
        fin = open(inputfile,'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        
    elif this_type == 'tag':
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_param = open(parameter_filename,'r')
        overall_parameters = pickler.load(fd_param)
        fd_param.close()

        #Input is a single file
        files.append(inputfile)
        
        #Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_param = open(parameter_filename,'r')
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val
        
        #Input is a file with a list of files
        fin = open(inputfile,'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder+'/'+TESTING_FILENAME,'w')
     
      
    gold_fd = None    
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename ,'w')
          

    for filename in files:
        if log:
            print>>sys.stderr,'TARGET: processing file', filename
        
        if isinstance(filename,KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
            
        create_structures(naf_obj, filename)
        
        #Extract all the opinions
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
       
        
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    target = opinion.get_target()
                    if target is not None:  
                        span = target.get_span()
                        if span is not None:
                            S = span.get_span_ids()
                            if len(S) != 0:    
                                sentence_id = get_sentence_id_for_opinion(naf_obj,opinion)
                                if sentence_id is not None:
                                    opinions_per_sentence[sentence_id].append(opinion)
                                    num_opinions += 1
                    
        if log:
            print>>sys.stderr,'\tNum of opinions:', num_opinions
        
        if this_type == 'train':
            # For training, a sequence is created for every opinion:
            # one per DSE, so sentences may be repeated
            sentences_with_opinions = set()
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    sentences_with_opinions.add(this_sentence)
                    create_sequence(naf_obj, this_type, this_sentence, overall_parameters, opinion, output = output_fd)
            
            #Include the rest of the sentences without opinions
            '''
            for sentence_id in naf_obj.list_sentence_ids:
                if sentence_id not in sentences_with_opinions:
                    create_sequence(naf_obj, sentence_id, overall_parameters, list_opinions=[])
            '''
                
        elif this_type=='tag':
            # Obtain the opinions per sentence
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#')+1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                sentence_for_opinion = first_token.get_sent()
                opinions_per_sentence[sentence_for_opinion].append(list_ids)
                
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence, overall_parameters, opinion = list_dse_token_ids, output = output_fd,log=log)  

        elif this_type=='test':
            #For the testing, one sequence is created for every sentence, with no opinion included
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence, overall_parameters,opinion, output = output_fd)
                    opinion_list.append(opinion)
   
            if gold_fd is not None:
                create_gold_standard_target(naf_obj,opinion_list,gold_fd)
            
            
    if gold_fd is not None:
        gold_fd.close() 
        print>>sys.stderr,'Gold standard in the file %s' % gold_fd.name
        
    return output_fd.name 
Example #35
def nafobj(naffile_coref):
    from KafNafParserPy import KafNafParser
    return KafNafParser(naffile_coref)
Example #36
def get_naf(input_filename):
    try:
        naf = KafNafParser(input_filename)
    except XMLSyntaxError:
        with open(input_filename) as input_file:
            input = input_file.read()
        if "<NAF" in input and "</NAF>" in input:
            # I'm guessing this should be a NAF file but something is wrong
            logger.exception("Error parsing NAF file")
            raise
        naf = KafNafParser(type="NAF")
        naf.set_version("3.0")
        naf.set_language("nl")
        naf.lang = "nl"
        naf.raw = input
        naf.set_raw(naf.raw)
    return naf
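A short usage note: get_naf parses the input as NAF when possible and otherwise wraps the raw text in a fresh Dutch NAF document, the same fallback pattern as create_naf in Example #7. Hypothetical calls:

for path in ("article.naf", "article.txt"):   # hypothetical file names
    naf = get_naf(path)
    print(path, naf.get_language())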
Example #37
def create_training_sentences(folder_tag_in,folder_kaf_in, opinion_layers,non_opinion,folder_out):
    #Remove the outputfolder if exists and create it again
    if os.path.exists(folder_out):
        shutil.rmtree(folder_out)
    os.mkdir(folder_out)
    total_sents_opi = total_sents_no_opi = 0
        
    for tag_file in glob.glob(os.path.join(folder_tag_in,'*.tag')):
        basename = os.path.basename(tag_file).replace('.tag','')
        kaf_file = os.path.join(folder_kaf_in,basename+'.kaf')
        if os.path.exists(kaf_file):
            ##From the tag file we extract the token ids for opinions and for non opinionated
            opinion_wids = set()        #token ids annotated as opinions
            no_opinion_wids = set()     #token ids annotated as no opinions
            
            fd = open(tag_file,'rb')
            for line in fd:
                fields = line.strip().split('\t')
                wid = fields[0]
                for opinion_idx in opinion_layers:
                    if fields[opinion_idx] == 'Opinion':
                        opinion_wids.add(wid)
                    
                    if non_opinion is not None and fields[non_opinion] == 'NON-OPINIONATED':
                        no_opinion_wids.add(wid)
                        
            fd.close()
            #########
            
            ###
            # Obtain the sentences that are opinionated (positive) and not (negative)
            # The negatives are:
            # If there are non-opinionated:  just the non opinionated
            # If not --> all the rest that are not positive
            #####
            sentences = {}
            all_sent_ids = set()
            sent_for_token_id = {}
            kaf_obj = KafNafParser(kaf_file)
            for token in kaf_obj.get_tokens():
                token_id = token.get_id()
                sent_id = token.get_sent()
                token_value = token.get_text()
                
                if sent_id not in sentences:
                    sentences[sent_id] = []
                sentences[sent_id].append(token_value)
                
                all_sent_ids.add(sent_id)
                
                sent_for_token_id[token_id] = sent_id
            ###
            
            positive_sents = set()
            negative_sents = set()
            
            ##Positive sents are the sentences for the opinion_ids
            for token_id in opinion_wids:
                positive_sents.add(sent_for_token_id[token_id])
            ####
            
            #Negative sents
            if non_opinion is not None:
                #In this case the negative are just the sentence of the no_opinion_wids
                for token_id in no_opinion_wids:
                    negative_sents.add(sent_for_token_id[token_id])
            else:
                #In this case the negative are all the sentences but the positive ones
                negative_sents = all_sent_ids - positive_sents
                
            #Free some memory    
            del opinion_wids
            del no_opinion_wids
            del kaf_obj
            
            ##Store the results in the file
            output_file = os.path.join(folder_out,basename+'.sents')
            fd_out = open(output_file,'w')
            fd_out.write('#'+tag_file+'\n')
            for sent_id in sorted(list(positive_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('+ '+text.encode('utf-8')+'\n')
                
            for sent_id in sorted(list(negative_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('- '+text.encode('utf-8')+'\n')
            fd_out.close()
            
            #print 'Processed ',basename
            #print '   Subjective sents:',len(positive_sents)
            #print '   Non subje. sents:',len(negative_sents)
            total_sents_opi += len(positive_sents)
            total_sents_no_opi += len(negative_sents)
        else:
            print 'KAF FILE NOT FOUND',kaf_file
    return total_sents_opi, total_sents_no_opi
Example #38
from KafNafParserPy import KafNafParser
import sys

if __name__ == '__main__':

    #Load Wordnet
    synset_for_skey = {}
    path_to_index_sense = '/home/izquierdo/wordnets/wordnet-3.0/dict/index.sense'
    fd = open(path_to_index_sense)
    for line in fd:
        fields = line.split()
        synset_for_skey[fields[0]] = fields[1]
    fd.close()

    naf_obj = KafNafParser(sys.stdin)

    for term in naf_obj.get_terms():
        this_skey = None
        this_synset = None
        ref_skey = ref_synset = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'sense':
                this_skey = ext_ref.get_reference()
                ref_skey = ext_ref
            if ext_ref.get_reftype() == 'ilidef':
                this_synset = ext_ref.get_reference()
                ref_synset = ext_ref

        if this_synset == '':
            print >> sys.stderr, term.get_id()
Example #39
if __name__ == '__main__':
    import glob
    #feature_file = 'my_feat_file'
    #fd = open(feature_file,'w')
    #for kaf_file in glob.glob('/home/izquierdo/data/opinion_annotations_en/kaf/hotel/*.kaf'):
    #    print kaf_file
    #    knaf_obj = KafNafParser(kaf_file)
    #    extract_features_polarity_classifier_from_kaf(knaf_obj, fd)
    #fd.close()
    #print ' Feature file in ',feature_file
    #train_polarity_classifier(feature_file)
    kaf_obj = KafNafParser('dutch00011_f1b91e00bddbf62fbb35e4755e786406.kaf')
    list_terms = []
    list_ids = []
    for opinion in kaf_obj.get_opinions():
        exp = opinion.get_expression()
        pol = exp.get_polarity()
        if pol in ['Positive','Negative','StrongPositive','StrongNegative']:
            this_id = (opinion.get_id(),pol)
            ids = exp.get_span().get_span_ids()
            list_ids.append(this_id)
            list_terms.append(ids)
    index_filename = '/home/izquierdo/cltl_repos/opinion_miner_deluxe/check_me/polarity_classifier/index.features'
    model_filename = '/home/izquierdo/cltl_repos/opinion_miner_deluxe/check_me/polarity_classifier/model.svm'
    svm_path = '/home/izquierdo/bin/svm_classify'
    results = classify(kaf_obj,list_terms,index_filename,model_filename, svm_path)
    for n in range(len(results)):
Example #40
def extract_all_features():
    train_files = load_training_files()
    logging.debug('Loaded '+str(len(train_files))+' files')

    feat_folder = my_config_manager.get_feature_folder_name()
    label_feats = separator = None
    my_stdout, my_stderr = sys.stdout,sys.stderr
    
    rel_exp_tar_filename = my_config_manager.get_relation_exp_tar_training_filename()
    exp_tar_rel_fic = open(rel_exp_tar_filename,'w')
   
    rel_exp_hol_filename = my_config_manager.get_relation_exp_hol_training_filename()
    exp_hol_rel_fic = open(rel_exp_hol_filename,'w') 
    
    ### LEXICON FROM THE DOMAIN
    expressions_lexicon = None
    targets_lexicon = None
    if my_config_manager.get_use_training_lexicons():
        # Create the lexicons
        
        ##GUESS THE LANG:
        first_train_file = train_files[0]
        obj = KafNafParser(first_train_file)
        lang = obj.get_language()
        
        expression_lexicon_filename = my_config_manager.get_expression_lexicon_filename()
        target_lexicon_filename = my_config_manager.get_target_lexicon_filename()
        
        
        this_exp_lex = my_config_manager.get_use_this_expression_lexicon()            
        this_tar_lex = my_config_manager.get_use_this_target_lexicon()

        
        if this_exp_lex is None or this_tar_lex is None:
            path_to_lex_creator = '/home/izquierdo/opener_repos/opinion-domain-lexicon-acquisition/acquire_from_annotated_data.py'
            training_filename = my_config_manager.get_file_training_list()
            lexicons_manager.create_lexicons(path_to_lex_creator,training_filename,expression_lexicon_filename,target_lexicon_filename)
        
        ##Once created we have to copy the previous one in case:
        if this_exp_lex is not None:
            if "$LANG" in this_exp_lex:
                this_exp_lex = this_exp_lex.replace('$LANG',lang)
            shutil.copy(this_exp_lex, expression_lexicon_filename)
            
        if this_tar_lex is not None:
            if "$LANG" in this_tar_lex:
                this_tar_lex = this_tar_lex.replace('$LANG',lang)
            shutil.copy(this_tar_lex,target_lexicon_filename)
        
        expressions_lexicon = lexicons_manager.load_lexicon(expression_lexicon_filename)
        targets_lexicon =  lexicons_manager.load_lexicon(target_lexicon_filename)
        
        this_propagation_lexicon = my_config_manager.get_propagation_lexicon_name()
        if this_propagation_lexicon is not None:
            if "$LANG" in this_propagation_lexicon:
                this_propagation_lexicon = this_propagation_lexicon.replace('$LANG',lang)
                
        print>>sys.stderr,'Propagated lexicon',this_propagation_lexicon
        
        
        

    ## Configuration for the relational classifier
    use_deps_now = my_config_manager.get_use_dependencies()
    use_toks_lems_now = my_config_manager.get_use_tokens_lemmas()
      
    accepted_opinions = my_config_manager.get_mapping_valid_opinions()
    use_dependencies_now = my_config_manager.get_use_dependencies()
    polarities_found_and_skipped = []
    for num_file, train_file in enumerate(train_files):
        logging.debug('Extracting features '+os.path.basename(train_file))
        base_name = os.path.basename(train_file)
        out_file = os.path.join(feat_folder,'file#'+str(num_file)+'#'+base_name+".feat")
        err_file = out_file+'.log'
        
        #Creates the output file
        # Returns the labels for the features and the separator used
        if True:
            kaf_naf_obj = KafNafParser(train_file)
            
            label_feats, separator, pols_skipped_this = extract_features_from_kaf_naf_file(kaf_naf_obj,out_file,err_file, 
                                                                                           accepted_opinions=accepted_opinions, 
                                                                                           exp_lex=expressions_lexicon, 
                                                                                           tar_lex=targets_lexicon,
                                                                                           propagation_lex_filename=this_propagation_lexicon)
            polarities_found_and_skipped.extend(pols_skipped_this)
            print>>exp_tar_rel_fic,'#'+train_file
            print>>exp_hol_rel_fic,'#'+train_file
            # SET valid_opinions to None to use all the possible opinions in the KAF file for extracting relations
            create_rel_exp_tar_training(kaf_naf_obj, output=exp_tar_rel_fic, valid_opinions=accepted_opinions,use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,use_lemmas=use_toks_lems_now)
            create_rel_exp_hol_training(kaf_naf_obj ,output=exp_hol_rel_fic, valid_opinions=accepted_opinions,use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,use_lemmas=use_toks_lems_now)
        if False:
        #except Exception as e:
            sys.stdout, sys.stderr = my_stdout, my_stderr
            print>>sys.stderr,str(e),dir(e)
            pass
        
    ##Show, just for information, how many instances have been skipped because the polarity of the opinion expression was not allowed
    count = defaultdict(int)
    for exp_label in polarities_found_and_skipped:
        count[exp_label] += 1
    info = '\nOpinions skipped because the polarity label is not included in the configuration\n'
    info += 'Accepted opinions: '+' '.join(accepted_opinions.keys())+'\n'
    info += 'Number of complete opinions skipped\n'
    for label, c in count.items():
        info+=' '+label+' :'+str(c)+'\n'
    info+='\n'
    logging.debug(info)
    ###################################################
    
    
        
    #Re-set the stdout and stderr
    exp_tar_rel_fic.close()
    exp_hol_rel_fic.close()
    
    sys.stdout,sys.stderr = my_stdout, my_stderr
    #Save labelfeats and separator in a file
    filename = my_config_manager.get_feature_desc_filename()
    fic = open(filename,'w')
    fic.write(' '.join(label_feats)+'\n')
    fic.close()
    logging.debug('Description of features --> '+filename)
Example #41
def main(inputfile, type, folder, overall_parameters={},log=False):
    files = []
    output_fd = None
    if type == 'train':
        if not os.path.isdir(folder):
            os.mkdir(folder)
        res_fol = os.path.join(folder,RESOURCES_FOLDER)
        if not os.path.isdir(res_fol):
            os.mkdir(res_fol)
        output_fd = open(folder+'/'+TRAINING_FILENAME,'w')
            
        ##Save the parameters
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename,'w')
        pickler.dump(overall_parameters,fd_parameter,protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        fd_parameter.close()
        
        #Input is a file with a list of files
        fin = open(inputfile,'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        
    elif type == 'tag':
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_param = open(parameter_filename,'rb')
        try:
            overall_parameters = pickler.load(fd_param,encoding='bytes')
        except TypeError:
            overall_parameters = pickler.load(fd_param)
        fd_param.close()

        #Input is a single file
        files.append(inputfile)
        
        #Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif type == 'test':
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_param = open(parameter_filename,'rb')
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in list(these_overall_parameters.items()):
            overall_parameters[opt] = val
        
        #Input is a file with a list of files
        fin = open(inputfile,'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder+'/'+TESTING_FILENAME,'w')
        
          
    ##Load the sentiment-nva-gi42.txt
    ##overall_parameters['sentiment-nva-gi42'] = load_sentiment_nva_gi42()  
    
    
    ##overall_parameters['lexOut_90000_monovalue'] = load_lexOut_90000()

    ###if overall_parameters['use_mpqa_lexicon']:
    from mpqa_lexicon import MPQA_subjectivity_lexicon
    overall_parameters['mpqa_lexicon'] = MPQA_subjectivity_lexicon()
    
    
    if overall_parameters.get('use_wordnet_lexicon', False):
        from wordnet_lexicon import WordnetLexicon
        wordnet_lexicon_expression = WordnetLexicon()
        complete_wn_filename = os.path.join(folder, RESOURCES_FOLDER, WORDNET_LEXICON_FILENAME) 

        if type == 'train':
            #We create it from the training files
            print('Creating WORDNET LEXICON FILE from %d files and storing it on %s' % (len(files), complete_wn_filename), file=sys.stderr)
            wordnet_lexicon_expression.create_from_files(files,'expression')
            wordnet_lexicon_expression.save_to_file(complete_wn_filename)
        else:
            #READ IT
            wordnet_lexicon_expression.load_from_file(complete_wn_filename)
        overall_parameters['wordnet_lexicon'] = wordnet_lexicon_expression
        
    gold_fd = None    
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename ,'w')
          
    #Processing every file
    
    #### FOR THE CUSTOM LEXICON
    #from customized_lexicon import CustomizedLexicon
    #overall_parameters['custom_lexicon'] = CustomizedLexicon()
    #overall_parameters['custom_lexicon'].load_from_filename('EXP.nl')
    ###########################

    #from customized_lexicon import CustomizedLexicon
    #overall_parameters['custom_lexicon'] = CustomizedLexicon()
    #overall_parameters['custom_lexicon'].load_for_language('it')
    
    for filename in files:
        if log:
            print('EXPRESSION: processing file', filename, file=sys.stderr)
        
        if isinstance(filename,KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)
        
        #Extract all the opinions
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                
                if p != 'NON-OPINIONATED':
                    #if p.startswith('D-'):           
                    sentence_id = get_sentence_id_for_opinion(naf_obj,opinion)
                    if sentence_id is not None:
                        opinions_per_sentence[sentence_id].append(opinion)
                        num_opinions += 1
        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)
        
        
        if type == 'train':
            ############################
            # One sequence per sentence
            ############################
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id,[])
                if len(opinions_in_sent) != 0:
                    ##Only sentences with opinions
                    create_sequence(naf_obj, sentence_id, overall_parameters, opinions_in_sent, output = output_fd)
        elif type == 'test':
            #TESTING CASE
            #For the testing, one sequence is created for every sentence
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id,[])
                if len(opinions_in_sent) != 0:
                    #Only tested on sentences with opinions
                    create_sequence(naf_obj, sentence_id, overall_parameters, opinions_in_sent,output = output_fd)
                    
            ## Create the gold standard data also
            opinion_list = []
            for this_sentence, these_opinions in list(opinions_per_sentence.items()):
                opinion_list.extend(these_opinions)
            if gold_fd is not None:
                create_gold_standard(naf_obj,opinion_list,gold_fd)
        elif type == 'tag':
            #TAGGING CASE
            # All the sentences are considered
            for sentence_id in naf_obj.list_sentence_ids:
                create_sequence(naf_obj, sentence_id, overall_parameters, list_opinions = [],output = output_fd, log=log)
            
            
    if gold_fd is not None:
        gold_fd.close() 
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)
        
    output_fd.close()
    return output_fd.name
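
A hypothetical call sketch for the main() above; the file list and folder names are placeholders chosen here for illustration, while the parameter keys ('use_wordnet_lexicon', 'gold_standard') mirror the ones read in the body:

params = {'use_wordnet_lexicon': False}   # optionally add 'gold_standard': <path> to also write gold data in 'test' mode
training_data = main('training_files.list', 'train', 'model_folder', overall_parameters=params, log=True)
print('Training sequences written to %s' % training_data)
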
Example #42
0
File: corenlp.py Project: amcat/nlpipe
def corenlp2naf(xml_bytes, annotators):
    """
    Call from on the text and return a Naf object
    """
    naf = KafNafParser(type="NAF")

    try:
        doc = Document(xml_bytes)
    except:
        log.exception("Error on parsing xml")
        raise

    terms = {} # (xml_sentid, xml_tokenid) : term
    for sent in doc.sentences:
        for t in sent.tokens:
            wf = naf.create_wf(t.word, sent.id, t.character_offset_begin)
            term = naf.create_term(t.lemma, POSMAP[t.pos], t.pos, [wf])
            terms[sent.id, t.id] = term
            if t.ner not in (None, 'O'):
                naf.create_entity(t.ner, [term.get_id()])
        if sent.collapsed_ccprocessed_dependencies:
            dependencies = True
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                if dep.type != 'root':
                    child = terms[sent.id, dep.dependent.idx]
                    parent = terms[sent.id, dep.governor.idx]
                    comment = "{t}({o}, {s})".format(s=child.get_lemma(), t=dep.type, o=parent.get_lemma())
                    naf.create_dependency(child.get_id(), parent.get_id(), dep.type, comment=comment)

    if doc.coreferences:
        for coref in doc.coreferences:
            cterms = set()
            for m in coref.mentions:
                cterms |= {terms[m.sentence.id, t.id].get_id() for t in m.tokens}
            naf.create_coreference("term", cterms)
        
    for annotator in annotators:
        if annotator in LAYERMAP:
            naf.create_linguistic_processor(LAYERMAP[annotator], "CoreNLP {annotator}".format(**locals()),
                                            get_corenlp_version())
    s = BytesIO()
    naf.dump(s)
    return s.getvalue()
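
Since corenlp2naf() returns the NAF document serialized as bytes, a caller can load the result back with KafNafParser to inspect the created layers. A minimal sketch, assuming KafNafParserPy is installed and naf_bytes holds the return value:

from io import BytesIO
from KafNafParserPy import KafNafParser

parsed = KafNafParser(BytesIO(naf_bytes))   # naf_bytes = corenlp2naf(xml_bytes, annotators)
for term in parsed.get_terms():
    print('%s\t%s\t%s' % (term.get_id(), term.get_lemma(), term.get_pos()))
for dep in parsed.get_dependencies():
    print('%s -%s-> %s' % (dep.get_from(), dep.get_function(), dep.get_to()))
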
Example #43
0
def get_naf_from_sentences(sentences):
    naf_obj = KafNafParser(type="NAF")
    naf_obj.set_version("3.0")
    naf_obj.set_language("nl")
    naf_obj.lang = "nl"
    naf_obj.raw = '\n'.join([' '.join(s) for s in sentences])
    naf_obj.set_raw(naf_obj.raw)
    # Create text layer
    wcount = 1
    offsets = {}
    txt = naf_obj.get_raw()
    token_ids = []
    for sid, sentence in enumerate(sentences):
        token_ids_sub = []
        for token in sentence:
            token_obj = KafNafParserPy.Cwf(type=naf_obj.get_type())
            token_id = 'w{}'.format(wcount)
            token_length = len(token)
            offsets[wcount] = txt.find(token, offsets.get(wcount - 1, 0))
            token_obj.set_id(token_id)
            token_obj.set_length(str(token_length))
            # token_obj.set_offset(str(offset)) # Is this correct????
            token_obj.set_para('1')
            token_obj.set_sent(str(sid + 1))
            token_obj.set_text(token)
            token_obj.set_offset(str(offsets[wcount]))
            token_ids_sub.append(token_id)
            wcount += 1
            naf_obj.add_wf(token_obj)
        token_ids.append(token_ids_sub)
    # Create term layers
    term_ids = []
    count_terms = 0
    for sid, (sentence, token_ids_sub) in enumerate(zip(sentences, token_ids)):
        term_ids_sub = []
        logger.info('Creating the term layer...')
        for num_token, (token,
                        token_id) in enumerate(zip(sentence, token_ids_sub)):
            new_term_id = 't_' + str(count_terms)
            count_terms += 1
            term_ids_sub.append(new_term_id)
            term_obj = KafNafParserPy.Cterm(type=naf_obj.get_type())
            term_obj.set_id(new_term_id)
            new_span = KafNafParserPy.Cspan()
            new_span.create_from_ids([token_id])
            term_obj.set_span(new_span)
            naf_obj.add_term(term_obj)
        term_ids.append(term_ids_sub)

    return naf_obj, term_ids
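
A small usage sketch for get_naf_from_sentences(); the sentences below are made up, each inner list being one pre-tokenized sentence:

sentences = [['dit', 'is', 'een', 'zin'], ['nog', 'een', 'voorbeeld']]
naf_obj, term_ids = get_naf_from_sentences(sentences)
naf_obj.dump()   # writes the generated NAF XML to stdout
# term_ids groups the new term ids per sentence, e.g. [['t_0', 't_1', 't_2', 't_3'], ['t_4', 't_5']]
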
		map[fields[0]] = fields[1]
	fic.close()
	return map



if __name__=='__main__':
	this_folder = os.path.dirname(os.path.realpath(__file__))

	if sys.stdin.isatty():
			print>>sys.stderr,'Input stream required.'
			print>>sys.stderr,'Example usage: cat myUTF8file.kaf |',sys.argv[0]
			sys.exit(-1)


	input_obj = KafNafParser(sys.stdin)
	my_lang = input_obj.get_language()

	complete_path_to_treetagger = find_treetagger()
	if complete_path_to_treetagger is None:
		print>>sys.stderr,'TreeTagger could not be found. You need to specify where TreeTagger is installed in one of 2 ways:'
		print>>sys.stderr,'\t1) Update the TREE_TAGGER_PATH variable in the file lib/__init__.py'
		print>>sys.stderr,'\t2) Update your TREE_TAGGER_PATH environment variable'
		sys.exit(0)
        
        
	# In the latest version of TreeTagger the command names have been changed from X-utf8 to just X
	# /cmd/tree-tagger-english-utf8 ==> /cmd/tree-tagger-english
	# This could be a problem if another version of TreeTagger is being used.
	if my_lang == 'en':
		treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-english'
Example #45
0
File: nlpamcat.py Project: mcomsa/nlpipe
def _get_text(a, to_naf=False, lang='nl'):
    result = "\n\n".join([_normalize(a[x]) for x in ('headline', 'text')])
    if to_naf:
        naf = KafNafParser(type="NAF")
        naf.header = CHeader(type=naf.type)
        naf.root.insert(0, naf.header.get_node())

        naf.set_language(lang)
        naf.set_raw(result)
        naf.set_version("3.0")

        fd = CfileDesc()
        if 'author' in a:
            fd.set_author(a['author'])
        if 'headline' in a:
            fd.set_title(a['headline'])
        if 'date' in a:
            fd.set_creationtime(a['date'])
        if 'medium' in a:
            fd.set_magazine(a['medium'])
        if 'page' in a:
            fd.set_pages(str(a['page']))
        if 'section' in a:
            fd.set_section(a['section'])
        naf.header.set_fileDesc(fd)

        naf.header.set_publicId(a['uuid'])
        #if 'url' in a:
        #    naf.header.set_uri(a['url'])
        b = BytesIO()
        naf.dump(b)
        result = b.getvalue().decode("utf-8")
    return result
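
A hedged usage sketch for _get_text(); the article dict is hypothetical and only carries the keys the function checks, and it assumes the helpers used above (_normalize, CHeader, CfileDesc) are importable as in the original module:

article = {'headline': 'Example headline', 'text': 'Body of the article.',
           'author': 'A. Author', 'date': '2016-01-01T00:00:00',
           'medium': 'Example Medium', 'page': 1, 'section': 'News',
           'uuid': '00000000-0000-0000-0000-000000000000'}
naf_xml = _get_text(article, to_naf=True, lang='nl')   # returns the NAF document as a string
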
Example #46
0
from KafNafParserPy import KafNafParser, Clp, Crole, Cspan, Cpredicate
from KafNafParserPy.span_data import Ctarget
import sys
import datetime
import time
import pprint
import re

# Make sure you get the order of the input files right
nafinput = sys.argv[1]
timblpredictions = sys.argv[2]

my_parser = KafNafParser(nafinput)

## Create header info
lp = Clp()
lp.set_name('SoNaR-News-trained-SRL')
lp.set_version('1.1')
lp.set_timestamp()
my_parser.add_linguistic_processor('srl', lp)

# If the naf file already contains predicates, store those to make sure
# you don't overwrite them or create new predicate elements for existing predicates
roles = []
predicate_spans = []
for predicate in my_parser.get_predicates():
    for role in predicate.get_roles():
        role_id = role.get_id()
Example #47
0
def main(inputfile, type, folder, overall_parameters={},log=False):
    files = []
    output_fd = None
    if type == 'train':
        if not os.path.isdir(folder):
            os.mkdir(folder)
        res_fol = os.path.join(folder,RESOURCES_FOLDER)
        if not os.path.isdir(res_fol):
            os.mkdir(res_fol)
        output_fd = open(folder+'/'+TRAINING_FILENAME,'w')
            
        ##Save the parameters
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename,'w')
        pickler.dump(overall_parameters,fd_parameter,protocol=0)
        print>>sys.stderr,'Parameters saved to file %s' % parameter_filename
        fd_parameter.close()
        
        #Input is a file with a list of files
        fin = open(inputfile,'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        
    elif type == 'tag':
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_param = open(parameter_filename,'r')
        overall_parameters = pickler.load(fd_param)
        fd_param.close()

        #Input is a single file
        files.append(inputfile)
        
        #Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif type == 'test':
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_param = open(parameter_filename,'r')
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val
        
        #Input is a file with a list of files
        fin = open(inputfile,'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder+'/'+TESTING_FILENAME,'w')
        
          
    ##Load the sentiment-nva-gi42.txt
    ##overall_parameters['sentiment-nva-gi42'] = load_sentiment_nva_gi42()  
    
    
    ##overall_parameters['lexOut_90000_monovalue'] = load_lexOut_90000()

    ###if overall_parameters['use_mpqa_lexicon']:
    from mpqa_lexicon import MPQA_subjectivity_lexicon
    overall_parameters['mpqa_lexicon'] = MPQA_subjectivity_lexicon()
    
    
    if overall_parameters.get('use_wordnet_lexicon', False):
        from wordnet_lexicon import WordnetLexicon
        wordnet_lexicon_expression = WordnetLexicon()
        complete_wn_filename = os.path.join(folder, RESOURCES_FOLDER, WORDNET_LEXICON_FILENAME) 

        if type == 'train':
            #We create it from the training files
            print>>sys.stderr,'Creating WORDNET LEXICON FILE from %d files and storing it on %s' % (len(files), complete_wn_filename)
            wordnet_lexicon_expression.create_from_files(files,'expression')
            wordnet_lexicon_expression.save_to_file(complete_wn_filename)
        else:
            #READ IT
            wordnet_lexicon_expression.load_from_file(complete_wn_filename)
        overall_parameters['wordnet_lexicon'] = wordnet_lexicon_expression
        
    gold_fd = None    
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename ,'w')
          
    #Processing every file
    
    #### FOR THE CUSTOM LEXICON
    #from customized_lexicon import CustomizedLexicon
    #overall_parameters['custom_lexicon'] = CustomizedLexicon()
    #overall_parameters['custom_lexicon'].load_from_filename('EXP.nl')
    ###########################

    #from customized_lexicon import CustomizedLexicon
    #overall_parameters['custom_lexicon'] = CustomizedLexicon()
    #overall_parameters['custom_lexicon'].load_for_language('it')
    
    for filename in files:
        if log:
            print>>sys.stderr,'EXPRESSION: processing file', filename
        
        if isinstance(filename,KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)
        
        #Extract all the opinions
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                
                if p != 'NON-OPINIONATED':
                    #if p.startswith('D-'):           
                    sentence_id = get_sentence_id_for_opinion(naf_obj,opinion)
                    if sentence_id is not None:
                        opinions_per_sentence[sentence_id].append(opinion)
                        num_opinions += 1
        if log:
            print>>sys.stderr,'\tNum of opinions:', num_opinions
        
        
        if type == 'train':
            ############################
            # One sequence per sentence
            ############################
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id,[])
                if len(opinions_in_sent) != 0:
                    ##Only sentences with opinions
                    create_sequence(naf_obj, sentence_id, overall_parameters, opinions_in_sent, output = output_fd)
        elif type == 'test':
            #TESTING CASE
            #For the testing, one sequence is created for every sentence
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id,[])
                if len(opinions_in_sent) != 0:
                    #Only tested on sentences with opinions
                    create_sequence(naf_obj, sentence_id, overall_parameters, opinions_in_sent,output = output_fd)
                    
            ## Create the gold standard data also
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                opinion_list.extend(these_opinions)
            if gold_fd is not None:
                create_gold_standard(naf_obj,opinion_list,gold_fd)
        elif type == 'tag':
            #TAGGING CASE
            # All the sentences are considered
            for sentence_id in naf_obj.list_sentence_ids:
                create_sequence(naf_obj, sentence_id, overall_parameters, list_opinions = [],output = output_fd, log=log)
            
            
    if gold_fd is not None:
        gold_fd.close() 
        print>>sys.stderr,'Gold standard in the file %s' % gold_fd.name
        
    output_fd.close()
    return output_fd.name
Example #48
0
def create_training_sentences(folder_tag_in, folder_kaf_in, opinion_layers,
                              non_opinion, folder_out):
    #Remove the outputfolder if exists and create it again
    if os.path.exists(folder_out):
        shutil.rmtree(folder_out)
    os.mkdir(folder_out)
    total_sents_opi = total_sents_no_opi = 0

    for tag_file in glob.glob(os.path.join(folder_tag_in, '*.tag')):
        basename = os.path.basename(tag_file).replace('.tag', '')
        kaf_file = os.path.join(folder_kaf_in, basename + '.kaf')
        if os.path.exists(kaf_file):
            ##From the tag file we extract the token ids for opinions and for non opinionated
            opinion_wids = set()  #token ids annotated as opinions
            no_opinion_wids = set()  #token ids annotated as no opinions

            fd = open(tag_file, 'rb')
            for line in fd:
                fields = line.strip().split('\t')
                wid = fields[0]
                for opinion_idx in opinion_layers:
                    if fields[opinion_idx] == 'Opinion':
                        opinion_wids.add(wid)

                    if non_opinion is not None and fields[
                            non_opinion] == 'NON-OPINIONATED':
                        no_opinion_wids.add(wid)

            fd.close()
            #########

            ###
            # Obtain the sentences that are opinionated (positive) and not (negative)
            # The negatives are:
            # If there are non-opinionated:  just the non opinionated
            # If not --> all the rest that are not positive
            #####
            sentences = {}
            all_sent_ids = set()
            sent_for_token_id = {}
            kaf_obj = KafNafParser(kaf_file)
            for token in kaf_obj.get_tokens():
                token_id = token.get_id()
                sent_id = token.get_sent()
                token_value = token.get_text()

                if sent_id not in sentences:
                    sentences[sent_id] = []
                sentences[sent_id].append(token_value)

                all_sent_ids.add(sent_id)

                sent_for_token_id[token_id] = sent_id
            ###

            positive_sents = set()
            negative_sents = set()

            ##Positive sents are the sentences for the opinion_ids
            for token_id in opinion_wids:
                positive_sents.add(sent_for_token_id[token_id])
            ####

            #Negative sents
            if non_opinion is not None:
                #In this case the negative are just the sentence of the no_opinion_wids
                for token_id in no_opinion_wids:
                    negative_sents.add(sent_for_token_id[token_id])
            else:
                #In this case the negative are all the sentences but the positive ones
                negative_sents = all_sent_ids - positive_sents

            #Free some memory
            del opinion_wids
            del no_opinion_wids
            del kaf_obj

            ##Store the results in the file
            output_file = os.path.join(folder_out, basename + '.sents')
            fd_out = open(output_file, 'w')
            fd_out.write('#' + tag_file + '\n')
            for sent_id in sorted(list(positive_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('+ ' + text.encode('utf-8') + '\n')

            for sent_id in sorted(list(negative_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('- ' + text.encode('utf-8') + '\n')
            fd_out.close()

            #print 'Processed ',basename
            #print '   Subjective sents:',len(positive_sents)
            #print '   Non subje. sents:',len(negative_sents)
            total_sents_opi += len(positive_sents)
            total_sents_no_opi += len(negative_sents)
        else:
            print 'KAF FILE NOT FOUND', kaf_file
    return total_sents_opi, total_sents_no_opi
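
A hypothetical call for create_training_sentences(); the folder names and column indexes are placeholders (opinion_layers lists the tab-separated columns of the .tag files that may contain 'Opinion', and non_opinion is the column that may contain 'NON-OPINIONATED', or None):

n_opi, n_no_opi = create_training_sentences('annotations_tag/', 'corpus_kaf/',
                                            opinion_layers=[1, 2], non_opinion=3,
                                            folder_out='training_sentences/')
print('Opinionated sentences: %d, non-opinionated: %d' % (n_opi, n_no_opi))
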
    def train(self,list_training_files, out_folder):
        self.folder= out_folder
        os.mkdir(self.folder)
        print('Creating output folder %s' % self.folder)
        
        training_fd = open(os.path.join(self.folder,TRAIN_FILE),'w')
        
        
        for this_file in list_training_files:
            print('\tEncoding training file %s' % this_file)
            
            this_obj = KafNafParser(this_file)
            num_pos = num_neg = 0
            for opinion in this_obj.get_opinions():
                opinion_expression = opinion.get_expression()
                polarity = opinion_expression.get_polarity()
                
                span_obj = opinion_expression.get_span()
                if span_obj is None:
                    continue
                
                list_term_ids = span_obj.get_span_ids()
                features = self.extract_features(this_obj, list_term_ids)
                
            
                int_features = self.encode_string_features(features, update_index=True) #Map feat index --> frequency
                
                if len(int_features) != 0:                
                    this_class = None
                    if self.is_positive(polarity):
                        this_class = '+1'
                        num_pos += 1
                    elif self.is_negative(polarity):
                        this_class = '-1'
                        num_neg += 1
                    
                    if this_class is not None:
                        self.write_example_to_file(training_fd, this_class, int_features)

            #END FOR
            print('\t\tNum positive examples: %d' % num_pos)
            print('\t\tNum negative examples: %d' % num_neg)
        training_fd.close()
        print('Training file at %s' % training_fd.name)
        
        ##RUN THE TRAINING
        training_cmd = [SVM_LEARN]
        
        training_cmd.append(training_fd.name)
        
        whole_model_file = os.path.join(self.folder, MODEL_FILE)
        training_cmd.append(whole_model_file)
        ret_code = check_call(training_cmd)
        print('Training done on %s with code %d' % (whole_model_file,ret_code))
        
        #Save also the index
        whole_index_file = os.path.join(self.folder,INDEX_FILE)
        index_fd = open(whole_index_file,'wb')
        pickle.dump(self.index_features, index_fd, -1)
        index_fd.close()
        print('Feature index saved to %s with %d features' % (whole_index_file,len(self.index_features)))
Example #50
0
def extract_all_features():
    train_files = load_training_files()
    logging.debug('Loaded '+str(len(train_files))+' files')

    feat_folder = my_config_manager.get_feature_folder_name()
    label_feats = separator = None
    my_stdout, my_stderr = sys.stdout,sys.stderr
    
    rel_exp_tar_filename = my_config_manager.get_relation_exp_tar_training_filename()
    exp_tar_rel_fic = open(rel_exp_tar_filename,'w')
   
    rel_exp_hol_filename = my_config_manager.get_relation_exp_hol_training_filename()
    exp_hol_rel_fic = open(rel_exp_hol_filename,'w') 
    
    filename_features_polarity_classifier = my_config_manager.get_filename_features_polarity_classifier()
    fd_filename_features_polarity_classifier = open(filename_features_polarity_classifier,'w')
    
     

    ## Configuration for the relational classifier
    use_these_lexicons = []
    use_deps_now = my_config_manager.get_use_dependencies()
    use_toks_lems_now = my_config_manager.get_use_tokens_lemmas()
      
    #accepted_opinions = my_config_manager.get_mapping_valid_opinions(map_all_to_this=OPINION_EXPRESSION)
    accepted_opinions = my_config_manager.get_mapping_valid_opinions(map_all_to_this=None)
    mapping_positive_negative = my_config_manager.get_mapping_valid_opinions()
    use_dependencies_now = my_config_manager.get_use_dependencies()
    polarities_found_and_skipped = []
    for num_file, train_file in enumerate(train_files):
        logging.debug('Extracting features '+os.path.basename(train_file))
        base_name = os.path.basename(train_file)
        out_file = os.path.join(feat_folder,'file#'+str(num_file)+'#'+base_name+".feat")
        err_file = out_file+'.log'
        

        kaf_naf_obj = KafNafParser(train_file)
        print>>sys.stderr,'Extracting features from',train_file
        
        if num_file == 0: #The first time we load the lexicons
            lang = kaf_naf_obj.get_language()
            use_these_lexicons = load_lexicons(my_config_manager,lang)
            
        label_feats, separator, pols_skipped_this = extract_features_from_kaf_naf_file(kaf_naf_obj,out_file,err_file, 
                                                                                       accepted_opinions=accepted_opinions, 
                                                                                       lexicons = use_these_lexicons)
         
        polarities_found_and_skipped.extend(pols_skipped_this)
        print>>exp_tar_rel_fic,'#'+train_file
        print>>exp_hol_rel_fic,'#'+train_file
            
        # Set valid_opinions to None to use all the possible opinions in the KAF file for extracting relations
        # Set valid_opinions = accepted_opinions for filtering
        '''
        create_rel_exp_tar_training(kaf_naf_obj, output=exp_tar_rel_fic, valid_opinions=None,
                                    use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,
                                    use_lemmas=use_toks_lems_now,
                                    log=err_file)
        
        create_rel_exp_hol_training(kaf_naf_obj ,output=exp_hol_rel_fic, valid_opinions=None,
                                    use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,
                                    use_lemmas=use_toks_lems_now)
            
        '''
        ##Extract features for the polarity classifier
        #for mpqa there will be no polarity classifier
        #extract_features_polarity_classifier_from_kaf(kaf_naf_obj,fd_filename_features_polarity_classifier,mapping_positive_negative)
        
    fd_filename_features_polarity_classifier.close()
    ##Just for information: show how many instances have been skipped because the polarity of the opinion expression was not allowed
    count = defaultdict(int)
    for exp_label in polarities_found_and_skipped:
        count[exp_label] += 1
    info = '\nOpinions skipped because the polarity label is not included in the configuration\n'
    info += 'Accepted opinions: '+' '.join(accepted_opinions.keys())+'\n'
    info += 'Number of complete opinions skipped\n'
    for label, c in count.items():
        info+=' '+label+' :'+str(c)+'\n'
    info+='\n'
    logging.debug(info)
    ###################################################
    
    
        
    #Close the relation output files and re-set stdout and stderr
    exp_tar_rel_fic.close()
    exp_hol_rel_fic.close()
    
    sys.stdout,sys.stderr = my_stdout, my_stderr
    #Save the feature labels (label_feats) in a file
    filename = my_config_manager.get_feature_desc_filename()
    fic = open(filename,'w')
    fic.write(' '.join(label_feats)+'\n')
    fic.close()
    logging.debug('Description of features --> '+filename)
Example #51
0
File: corenlp.py Project: amcat/nlpipe
def corenlp2naf(xml_bytes, annotators):
    """
    Call from on the text and return a Naf object
    """
    naf = KafNafParser(type="NAF")

    try:
        doc = Document(xml_bytes)
    except:
        log.exception("Error on parsing xml")
        raise

    terms = {}  # (xml_sentid, xml_tokenid) : term
    for sent in doc.sentences:
        for t in sent.tokens:
            wf = naf.create_wf(t.word, sent.id, t.character_offset_begin)
            term = naf.create_term(t.lemma, POSMAP[t.pos], t.pos, [wf])
            terms[sent.id, t.id] = term
            if t.ner not in (None, 'O'):
                naf.create_entity(t.ner, [term.get_id()])
        if sent.collapsed_ccprocessed_dependencies:
            dependencies = True
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                if dep.type != 'root':
                    child = terms[sent.id, dep.dependent.idx]
                    parent = terms[sent.id, dep.governor.idx]
                    comment = "{t}({o}, {s})".format(s=child.get_lemma(),
                                                     t=dep.type,
                                                     o=parent.get_lemma())
                    naf.create_dependency(child.get_id(),
                                          parent.get_id(),
                                          dep.type,
                                          comment=comment)

    if doc.coreferences:
        for coref in doc.coreferences:
            cterms = set()
            for m in coref.mentions:
                cterms |= {
                    terms[m.sentence.id, t.id].get_id()
                    for t in m.tokens
                }
            naf.create_coreference("term", cterms)

    for annotator in annotators:
        if annotator in LAYERMAP:
            naf.create_linguistic_processor(
                LAYERMAP[annotator], "CoreNLP {annotator}".format(**locals()),
                get_corenlp_version())
    s = BytesIO()
    naf.dump(s)
    return s.getvalue()
def process_file(this_file,token_freq):
    xml_obj = KafNafParser(this_file)
    print>>sys.stderr,'Processing file',this_file
    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    whole_text = ' '
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        token_for_wid[token.get_id()] = text
        order_for_wid[token.get_id()] = n
        whole_text += text + ' '
    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        tid = term.get_id()
        wids = term.get_span().get_span_ids()
        wids_for_tid[tid] = wids
        for wid in wids:
            lemma_for_wid[wid] = term.get_lemma()
            pos_for_wid[wid] = term.get_pos()
        
    
    ##Properties!
    aspects = [] ## [(label,term_span)...]
    
    for property in xml_obj.get_properties():
        for refs in property.get_references():
            for span in refs:
                aspects.append((property.get_type(),span.get_span_ids()))
       
    
    
    already_counted = {EXP:set(), TAR:set()}
    
    for opinion in xml_obj.get_opinions():   
        for this_type, opinion_obj in [(EXP,opinion.get_expression()),(TAR,opinion.get_target())]:
            if this_type is EXP and opinion_obj.get_polarity()=='NON-OPINIONATED':
                continue
            if opinion_obj is not None:
                span = opinion_obj.get_span()
                if span is not None:
                    list_wids = []
                    for tid in span.get_span_ids():
                        list_wids.extend(wids_for_tid.get(tid,[]))
                    list_wids.sort(key=lambda wid: order_for_wid[wid])  ##Sorted according to the order of the tokens
                    
                    string_wids = '#'.join(list_wids)
                    opinion_tokens = ' '.join( token_for_wid[wid] for wid in list_wids)
                    opinion_lemmas = ' '.join( lemma_for_wid[wid] for wid in list_wids)
                    opinion_pos    = ' '.join( pos_for_wid[wid]   for wid in list_wids)
                    
                   
                    if string_wids not in already_counted[this_type]:
                        if this_type == EXP:
                            polarity = (opinion_obj.get_polarity()).lower()
                            opinion_expressions.append((opinion_tokens,polarity,opinion_lemmas,opinion_pos))
                        else:
                            ##Calculate the aspect type
                            possible_aspects = []
                            target_ids = span.get_span_ids()
                            for aspect_label, aspect_span in aspects:
                                num_in_common = len(set(target_ids) & set(aspect_span))
                                if num_in_common != 0:
                                    possible_aspects.append((aspect_label,num_in_common,len(aspect_span)))
                            aspect_for_target = 'unknown'

                            if len(possible_aspects) != 0:
                                ##Sort by the number in common first, and by the length of the aspect second
                                aspect_for_target = sorted(possible_aspects,key=lambda t: (t[1],t[2]), reverse=True)[0][0]
                            opinion_targets.append((opinion_tokens,aspect_for_target, opinion_lemmas,opinion_pos))
                        already_counted[this_type].add(string_wids)    
      
    del xml_obj
    print>>sys.stderr,'\tNumber of opinion expressions:',len(opinion_expressions)
    print>>sys.stderr,'\tNumber of opinion targets:',len(opinion_targets)
    print>>sys.stderr,'\tNumber of characters of the text:',len(whole_text)
    return opinion_expressions, opinion_targets, whole_text
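
A minimal sketch of calling process_file(); the .kaf path is a placeholder, and token_freq must support the += updates used above, e.g. a defaultdict(int):

from collections import defaultdict
token_freq = defaultdict(int)
expressions, targets, text = process_file('review_example.kaf', token_freq)
print('Top tokens: %s' % sorted(token_freq.items(), key=lambda t: -t[1])[:5])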

if __name__ == '__main__':
    
    files = []
    fd = open('nl.list.test')
    for line in fd:
        files.append(line.strip())
    fd.close()

    my_polarity_classifier = PolarityClassifier('nl')
    my_polarity_classifier.load_models(sys.argv[1])

    OK = WR = 1
    for example_file in files:
        this_obj = KafNafParser(example_file)
        
        
        my_polarity_classifier.classify_kaf_naf_object(this_obj)
        this_obj.dump()

        break
    
        GOLD = {}
        list_ids_term_ids = []
        for opinion in this_obj.get_opinions():
            op_exp = opinion.get_expression()
            polarity = op_exp.get_polarity()
            term_ids = op_exp.get_span().get_span_ids()
            list_ids_term_ids.append((opinion.get_id(),term_ids))
            GOLD[opinion.get_id()] = polarity
# Author: Marieke van Erp ([email protected])
# Date: 27 September 2014
#
# Update 23 February 2015: better constituent extraction for feature generation
# with help from Ruben Izquierdo


from KafNafParserPy import KafNafParser
import re
import sys 
from collections import OrderedDict
import codecs

input = sys.stdin

my_parser = KafNafParser(input)
	 
### We first need a list of the predicates that we want to create feature vectors for
predicates = {} 
for term_obj in my_parser.get_terms():
	predicate = re.match("WW", term_obj.get_morphofeat())
	if predicate is not None:
		predicates[term_obj.get_id()] = term_obj.get_pos()
		#print term_obj.get_id(), term_obj.get_morphofeat(), term_obj.get_lemma()

# We need the dependencies to find out the structure of the argument patterns
# and also to know which verbs are auxiliary verbs and which ones are main verbs
dependencies = {} 
for dep_obj in my_parser.get_dependencies():
	relparts = dep_obj.get_function().split('/')
	rel_from = relparts[0]