Example #1
File: postagger.py Project: jnoring/Crackr
def tag(text):
    text = tp.preprocess(text)
    #print text
    t1 = time.time()
    outlist = post.tag(text.split())
    t2 = time.time()
    print "POS Tagging complete. Time taken: ", t2-t1, " seconds"
    return outlist
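For reference, a minimal self-contained sketch of the same preprocess-tag-time pattern, using NLTK's built-in tagger as a stand-in for the project's tp/post modules (which are not shown in this excerpt); nltk and its averaged_perceptron_tagger data are assumed to be available:

import time
import nltk  # stand-in for the project's Stanford tagger wrapper

def tag_demo(text):
    # whitespace tokenization, as in tag() above
    t1 = time.time()
    outlist = nltk.pos_tag(text.split())
    t2 = time.time()
    print("POS tagging complete. Time taken:", t2 - t1, "seconds")
    return outlist

# tag_demo("the quick brown fox jumps") returns a list of (token, tag) pairs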
Example #2
def performTask(rawtext):
    text = textprocess.preprocess(rawtext)
    POS_text = pt.tag(text)
    print POS_text
    # Don't lowercase earlier, because the Stanford tagger may make
    # use of capitalization
    text = text.lower()
    skilldict = buildskilldict(skills)
    naive_skills = generate_naive(text, skilldict)
    expanded_skills = new_guesses(POS_text,naive_skills,words,clusters)
    return naive_skills, expanded_skills
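generate_naive is not shown in this excerpt; a rough guess at what a dictionary-based matching step of that kind might look like (names here are illustrative, not the project's actual implementation):

def naive_dictionary_match(text, skilldict):
    # keep only the lowercased tokens that appear in the skill dictionary
    return {token for token in text.split() if token in skilldict}

# naive_dictionary_match("experienced python and sql developer", {"python", "sql"})
# -> {'python', 'sql'}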
Example #3
def process_facc1_with_fileobj(facc1_obj, clueweb_obj, logout=sys.stdout, logerr=sys.stderr):
    nlpobj = tp.init_CoreNLPServer()
    record = clueweb_obj.read_record()
    entity_set = set()
    html_data = ''  # initialized so the error handler below can reference it even if preprocess fails on the first record
    is_a_new_record = True

    for line in facc1_obj:
        (trec_id, encoding, entity_name, entity_start, entity_end, _, __,
                freebase_id) = line.strip().split('\t')

        # We can iterate over the facc1 file in step with the clueweb09 file, because
        # both files are organized linearly and ordered by WARC-TREC-ID.
        while record is not None and ('warc-trec-id' not in record or record['warc-trec-id'] != trec_id):
            record = clueweb_obj.read_record()
            is_a_new_record = True
            sentences = []

        if record is None:
            break

        try:
            if is_a_new_record:
                is_a_new_record = False
                html_data = tp.preprocess(record.payload)

                # each time a new html file is parsed, the entity_set must be reset.
                entity_set.clear()

                #logerr.flush()
                #logout.flush()

                sentences = tp.get_sentences_from_html_v2(html_data, nlpobj)
                #tp.output_html(trec_id, html_data)
                #tp.output_sentences(trec_id, sentences)
        except Exception:
            logerr.write("\t".join((line.strip(), "failed_to_get_sentences", re.sub(r'\r\n', '', html_data))) + "\n")
            continue

        if freebase_id in entity_set:
            continue
        else:
            entity_set.add(freebase_id)

        # take the longest sentence among those in which the entity appears
        try:
            sentence = max((s for s in sentences if entity_name in s), key=len)
        except ValueError:
            logerr.write(line.strip() + "\tentity_not_found\n")
            continue

        logout.write("\t".join(x for x in (
            trec_id, entity_name, freebase_id, re.sub(r'\t', u' ', sentence).encode('utf-8')
            )) + "\n")
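The loop above relies on both files being sorted by WARC-TREC-ID; a self-contained sketch of that linear merge pattern in isolation (names are illustrative, not the project's):

def merge_by_key(annotations, records, key):
    # Both streams are assumed to be ordered by the same key, so the record
    # iterator only ever needs to be advanced forward, never rewound.
    record = next(records, None)
    for ann in annotations:
        # skip records that have no matching annotation lines
        while record is not None and record.get(key) != ann[key]:
            record = next(records, None)
        if record is None:
            break  # remaining annotations refer to records past the end of the stream
        yield ann, record

# list(merge_by_key(iter([{'id': 'A'}, {'id': 'C'}]),
#                   iter([{'id': 'A'}, {'id': 'B'}, {'id': 'C'}]), 'id'))
# -> [({'id': 'A'}, {'id': 'A'}), ({'id': 'C'}, {'id': 'C'})]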
Example #4
def naive(text, skillfilter=None, jointfilter=True):
    # preprocess
    text = textprocess.preprocess(text)
    
    # generate word scores
    wordscores = calculateWordScores(text)
    
    # tokenize    
    tokens = text.split()
    
    # prefilter    
    if skillfilter == 'pre':
        tokens = [token for token in tokens if token in skilldict]        
    phraseList = [ngram for n in range(3) for ngram in generate_ngrams(tokens, n + 1)]  # unigrams, bigrams, trigrams
    keywordcandidates = generateCandidateKeywordScores(phraseList, wordscores)  
    scored_ngrams = sorted(keywordcandidates.iteritems(), key=operator.itemgetter(1), reverse=True)
    
    # post-filter
    if skillfilter == 'post':
        scored_ngrams = [(ngram, score) for (ngram, score) in scored_ngrams if ngram in skilldict]
    
    # format
    return [{'keyword':pair[0], 'weight':pair[1]} for pair in scored_ngrams]
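calculateWordScores and generateCandidateKeywordScores are not shown in this excerpt; the names appear to match the RAKE reference implementation, so the scoring presumably follows the usual RAKE scheme. A simplified sketch of that scheme, with illustrative names:

from collections import defaultdict

def rake_word_scores(phrases):
    # standard RAKE word score: degree(word) / frequency(word)
    freq, degree = defaultdict(int), defaultdict(int)
    for phrase in phrases:
        words = phrase.split()
        for w in words:
            freq[w] += 1
            degree[w] += len(words) - 1  # co-occurrences with other words in the phrase
    return {w: (degree[w] + freq[w]) / float(freq[w]) for w in freq}

def rake_phrase_scores(phrases):
    # a candidate phrase's score is the sum of its word scores
    scores = rake_word_scores(phrases)
    return {p: sum(scores[w] for w in p.split()) for p in phrases}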