# Walk every entry of `files` and feed the selected gold noun phrases of each
# document to add_stats(), which receives `nouns` and `head2text`.
# NOTE: the console progress-bar output that once wrapped this loop
# (ProgressBar / sys.stdout.write) is disabled.
i = 0  # counts the entries that were actually processed
for f in files:
    # Entries starting with "#" are treated as commented out and skipped.
    if f.startswith("#"):
        continue
    i += 1

    f = f.strip()
    doc = Document(f)
    noun_phrases = reconcile.getNPs(f)
    gold_chains = reconcile.getGoldChains(f)
    doc.addGoldChains(gold_chains)

    for mention in noun_phrases:
        # Lower-case the mention text, clean it, and trim outer whitespace.
        cleaned = utils.textClean(mention.getText().lower()).strip()

        # Two mutually exclusive selection modes:
        #   TRUE_PRONOUNS set   -> keep only mentions whose text is in TRUE
        #   TRUE_PRONOUNS unset -> keep only mentions isNominal() accepts
        # NOTE(review): extra head-based filters (percents, numbers,
        # conjunctions) were disabled in the nominal mode -- confirm they
        # are intentionally off.
        if TRUE_PRONOUNS:
            selected = cleaned in TRUE
        else:
            selected = specificity_utils.isNominal(mention)

        if selected:
            add_stats(cleaned, mention, doc, nouns, head2text)
files = [x for x in fileList.readlines() if not x.startswith("#")] fileList.close() for f in files: f = f.strip() print("Working on document: %s" % f) #load in the document statistics d = Document(f) #the total number of sentences in this text file. #?double check with nltk? total_sentences_doc = len(reconcile.getSentences(f)) #process a document, get all the nominal stats that are requested. gold_chains = reconcile.getGoldChains(f, True) d.addGoldChains(gold_chains) for gc in list(gold_chains.keys()): base_antecedent = True previous_semantic_tag = "" prev_sent = -1 prev_tile = -1 prev_type = "" for mention in gold_chains[gc]: if HEADS_ONLY: head_clean = ' '.join(map(string.strip, \ mention.getATTR("HEAD_TEXT").split())).strip() text_ident = head_clean.lower() text_ident = mention.getATTR("HEAD_TEXT").strip().lower() else: