import sys

from KafNafParserPy import KafNafParser

# PATH_ANNOTATED_DATA, handle_properties, EXP and TAR are assumed to be
# defined elsewhere in this module.


def read_training_data(file_name):
    """Read a KAF/NAF file and match the aspects with the words."""
    parser = KafNafParser(PATH_ANNOTATED_DATA + file_name)
    terms = list(parser.get_terms())

    # Create a token dictionary containing the NAF info.
    tokens_container = dict()
    for token_el in parser.get_tokens():
        token_node = token_el.node
        token_id = token_node.get('wid').replace('w', 't')
        token_info = token_node.attrib
        tokens_container[token_id] = token_info

    properties = list(parser.get_properties())
    handled_properties, term_dict = handle_properties(
        properties, terms, tokens_container)
    return terms, properties, handled_properties, term_dict, tokens_container
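# A minimal usage sketch, assuming PATH_ANNOTATED_DATA points at a directory
# of annotated KAF/NAF files; 'review_001.kaf' is a hypothetical file name:
#
#     terms, properties, handled, term_dict, tokens = \
#         read_training_data('review_001.kaf')
#     print(len(terms), 'terms /', len(tokens), 'tokens')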
def process_file(this_file, token_freq):
    """Extract opinion expressions and opinion targets from one KAF/NAF file."""
    xml_obj = KafNafParser(this_file)
    print('Processing file', this_file, file=sys.stderr)

    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    whole_text = ' '
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        token_for_wid[token.get_id()] = text
        order_for_wid[token.get_id()] = n
        whole_text += text + ' '

    # Map each term to its word ids, and each word id to its lemma and PoS.
    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        tid = term.get_id()
        wids = term.get_span().get_span_ids()
        wids_for_tid[tid] = wids
        for wid in wids:
            lemma_for_wid[wid] = term.get_lemma()
            pos_for_wid[wid] = term.get_pos()

    # Properties: aspect annotations as (label, term_span) pairs.
    aspects = []
    for prop in xml_obj.get_properties():
        for refs in prop.get_references():
            for span in refs:
                aspects.append((prop.get_type(), span.get_span_ids()))

    already_counted = {EXP: set(), TAR: set()}
    for opinion in xml_obj.get_opinions():
        for this_type, opinion_obj in [(EXP, opinion.get_expression()),
                                       (TAR, opinion.get_target())]:
            # Check for None before dereferencing; the original checked the
            # polarity first, which crashed on opinions without an expression.
            if opinion_obj is None:
                continue
            # Skip expressions explicitly marked as non-opinionated.
            if this_type == EXP and opinion_obj.get_polarity() == 'NON-OPINIONATED':
                continue
            span = opinion_obj.get_span()
            if span is None:
                continue
            list_wids = []
            for tid in span.get_span_ids():
                list_wids.extend(wids_for_tid.get(tid, []))
            # Sort according to the order of the tokens in the text.
            list_wids.sort(key=lambda wid: order_for_wid[wid])
            string_wids = '#'.join(list_wids)
            opinion_tokens = ' '.join(token_for_wid[wid] for wid in list_wids)
            opinion_lemmas = ' '.join(lemma_for_wid[wid] for wid in list_wids)
            opinion_pos = ' '.join(pos_for_wid[wid] for wid in list_wids)
            if string_wids in already_counted[this_type]:
                continue
            if this_type == EXP:
                polarity = opinion_obj.get_polarity().lower()
                opinion_expressions.append(
                    (opinion_tokens, polarity, opinion_lemmas, opinion_pos))
            else:
                # Determine the aspect type from the overlap with the target span.
                possible_aspects = []
                target_ids = span.get_span_ids()
                for aspect_label, aspect_span in aspects:
                    num_in_common = len(set(target_ids) & set(aspect_span))
                    if num_in_common != 0:
                        possible_aspects.append(
                            (aspect_label, num_in_common, len(aspect_span)))
                aspect_for_target = 'unknown'
                if possible_aspects:
                    # Sort by the overlap first and by the length of the aspect second.
                    aspect_for_target = sorted(
                        possible_aspects,
                        key=lambda t: (t[1], t[2]),
                        reverse=True)[0][0]
                opinion_targets.append(
                    (opinion_tokens, aspect_for_target, opinion_lemmas, opinion_pos))
            already_counted[this_type].add(string_wids)

    del xml_obj
    print('\tNumber of opinion expressions:', len(opinion_expressions), file=sys.stderr)
    print('\tNumber of opinion targets:', len(opinion_targets), file=sys.stderr)
    print('\tNumber of characters of the text:', len(whole_text), file=sys.stderr)
    return opinion_expressions, opinion_targets, whole_text
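# A minimal driver sketch. The 'annotated_data/*.kaf' glob pattern is
# hypothetical; EXP and TAR are the opinion-layer constants this module
# defines elsewhere.
if __name__ == '__main__':
    import glob
    from collections import defaultdict

    token_freq = defaultdict(int)  # process_file increments raw token counts
    all_expressions = []
    all_targets = []
    for kaf_file in glob.glob('annotated_data/*.kaf'):  # hypothetical path
        expressions, targets, text = process_file(kaf_file, token_freq)
        all_expressions.extend(expressions)
        all_targets.extend(targets)
    print('Total opinion expressions:', len(all_expressions), file=sys.stderr)
    print('Total opinion targets:', len(all_targets), file=sys.stderr)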