Example #1
import nltk

import V  ##project-local module providing the word-vector classifier
##normalize(), better_qa(), load() and store() are assumed to be helpers
##defined elsewhere in the same file

def main():
    dev = load()
    dev_inner = dev["data"]
    result = {}
    classifier = V.classifier()

    for documents in dev_inner:
        for paragraph in documents['paragraphs']:
            pos_dict = {}
            word_dict = {}
            entities = {}
            entity_dict = {}
            tagged = nltk.pos_tag(nltk.word_tokenize(paragraph['context']))
            named_entities = nltk.chunk.ne_chunk(tagged)
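            ##ne_chunk returns an nltk.Tree; Tree.pos() flattens it into
            ##((word, POS), label) pairs, where label is the entity type
            ##(e.g. 'PERSON', 'GPE') for chunked tokens and 'S' (the tree
            ##root) for everything else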
            tags = named_entities.pos()
            for (word, pos), label in tags:
                if word not in pos_dict.setdefault(pos, []):
                    pos_dict[pos].append(word)
                if pos not in word_dict.setdefault(word, []):
                    word_dict[word].append(pos)
                if word not in entities.setdefault(label, []):
                    entities[label].append(word)
                if label not in entity_dict.setdefault(word, []):
                    entity_dict[word].append(label)

            paragraph_tokens = paragraph['context'].split() ##get tokens from context
            numofsen = 0  ##number of sentences
            sentence = {}
            sentence[numofsen] = []
            for x in paragraph_tokens:
                sentence[numofsen].append(normalize(x))
                ##treat any punctuation as a sentence boundary; otherwise sentences get very long
                if '.' in x or ',' in x or '?' in x or '!' in x or ';' in x or ':' in x:
                    numofsen += 1
                    sentence[numofsen] = []
            if sentence[numofsen]:  ##count a trailing sentence with no closing punctuation
                numofsen += 1
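            ##a more robust alternative would be nltk.sent_tokenize(
            ##paragraph['context']), which handles abbreviations and decimals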

            sentence_vectors = []
            for i in range(numofsen):
                sentence_vectors.append(classifier.create_avg_vector(sentence[i]))

            ##cue phrases mapping question wording to an expected answer type
            time_cues = ("what time", "which time", "what year", "which year",
                         "what century", "which century", "what month",
                         "which month", "what decade", "which decade")
            loc_cues = ("what place", "which place", "what area", "which area",
                        "what town", "which town", "what state", "which state",
                        "what city", "which city", "what country", "which country")
            for qa in paragraph['qas']: ##for each question, find the most similar sentence
                question = qa['question'].lower()
                if any(cue in question for cue in time_cues):
                    tag = "TIM"
                elif any(cue in question for cue in loc_cues):
                    tag = "LOC"
                elif "what person" in question or "which person" in question:
                    tag = "PER"
                elif "what" in question or "which" in question:
                    tag = "OTHER1"
                ##TODO: tag as OTHER whenever what/how comes before "when";
                ##tag "on what" as LOC; tag "what" followed by a LOC/PER/TIM
                ##word with that entity tag; tag "what time" as TIM
                elif "where" in question:
                    tag = "LOC"
                elif "when" in question:
                    tag = "TIM"
                elif "who" in question:
                    ##covers whom/whose as well
                    tag = "PER"
                else:
                    tag = "OTHER2"

                maxsim = -1 ##highest similarity seen so far
                maxnum = 0 ##index of the most similar sentence
                qa_tokens = normalize(qa['question']).split()
                qa_vector = classifier.create_avg_vector(qa_tokens)
                for i in range(numofsen):
                    ##cosine similarity between the sentence and question vectors
                    total = classifier.cosine_similarity(sentence_vectors[i], qa_vector)
                    if total >= maxsim: ##keep the most similar sentence
                        maxsim = total
                        maxnum = i
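                ##note: with '>=', ties go to the later sentence in the paragraph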
                better_qa(tag, sentence[maxnum], numofsen, qa['id'], qa['question'], entities, word_dict, result)
    store(result) ##write the answers to a json file once all documents are processed
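
The V.classifier object above comes from a project-local module whose source
is not part of this example. As a rough sketch only, assuming the classifier
wraps a {word: numpy array} embedding table (the `vectors` attribute and the
300-dimension fallback are placeholders, not the project's actual API), its
two methods used here could look like this:

import numpy as np

class classifier:
    def __init__(self):
        self.vectors = {}  ##hypothetical word -> embedding table

    def create_avg_vector(self, tokens):
        ##average the embeddings of the tokens we have vectors for
        vecs = [self.vectors[t] for t in tokens if t in self.vectors]
        if not vecs:
            return np.zeros(300)  ##assumed embedding dimensionality
        return np.mean(vecs, axis=0)

    def cosine_similarity(self, a, b):
        ##cos(a, b) = a.b / (|a| * |b|), with 0 for zero-length vectors
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0:
            return 0.0
        return float(np.dot(a, b) / denom)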
Example #2
import json

import V  ##project-local module providing the word-vector classifier

##write the answers dictionary to a json file
def store(data):
    with open('findsentence.json', 'w') as json_file:
        json_file.write(json.dumps(data))

##load the json file into a dictionary
def load():
    with open('training.json') as json_file:
        data = json.load(json_file)
        return data
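
##expected structure of training.json (SQuAD-style), as read by the code below:
##{"data": [{"paragraphs": [{"context": "...",
##                           "qas": [{"id": "...", "question": "..."}, ...]}, ...]}, ...]}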

if __name__ == "__main__":

    data = load()
    dataset = data['data']
    result = {}
    classifier = V.classifier()
    ##compute unigram overlap: word order in a question often differs from the
    ##context, so unigrams are more reliable than longer n-grams
    for article in dataset:
        for paragraph in article['paragraphs']:
            paragraph_tokens = paragraph['context'].split() ##get tokens from context
            numofsen = 0  ##number of sentences
            sentence = {}
            sentence[numofsen] = []
            for x in paragraph_tokens:
                sentence[numofsen].append(normalize(x))
                ##treat any punctuation as a sentence boundary
                if '.' in x or ',' in x or '?' in x or '!' in x or ';' in x or ':' in x:
                    numofsen += 1
                    sentence[numofsen] = []
            if sentence[numofsen]:  ##count a trailing sentence with no closing punctuation
                numofsen += 1
