Example No. 1
def list_all_entities():
    """POST request, generate a response listing the details of all the entities.

    Args:
        raw text: the user should post an article (raw text format) to this api.

    Returns:
        json: list the details of the entities in the article, in the json format
    """

    article = request.data.decode()
    my_doc = Doc(article)
    entities = []  # one dict of details per entity found in the article
    mapping = my_doc.map_position_start_index()

    for ent in my_doc.get_doc().ents:
        start_index = ent.start_char
        position = my_doc.get_position(start_index, mapping)
        label = my_doc.get_label(ent)

        entities.append({
            "entity": ent.text,
            "position": position,
            "label": label,
        })
    return jsonify(entities)
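A minimal client-side sketch for calling this endpoint with the requests library; the route path /entities and the localhost address are assumptions, since the snippet does not show the Flask route decorator.

import requests

# Hypothetical route and host: the example above omits its @app.route decorator.
article = "Apple is looking at buying a U.K. startup for $1 billion."
resp = requests.post("http://localhost:5000/entities",
                     data=article.encode("utf-8"),
                     headers={"Content-Type": "text/plain"})
for ent in resp.json():
    print(ent["entity"], ent["label"], ent["position"])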
Example No. 2
def parse_doc(self):
    """Parse a <doc> node in the ast.* instance, if there is one."""
    doc = self.node.find(self.ns(self.NS_CORE, 'doc'))

    if doc is not None:
        # Import Doc here (not at module level): the nested use of the Base
        # class would otherwise cause an infinite import loop.
        from Doc import Doc
        self.doc = Doc(self._namespace, doc)
Example No. 3
def extract_csv(filename):
    docs = {}
    df = pd.read_csv(filename)
    for i in range(df.shape[0]):
        text = df['Title'].values[i] + " " + df['Text'].values[i]
        if "Tag" in df.columns:
            tag = df['Tag'].values[i]
        else:
            tag = None
        docs[i] = Doc(i, text, tag)
    return docs
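A small usage sketch, assuming a CSV file with the Title and Text columns (and an optional Tag column) that extract_csv() reads; the file name sample.csv is made up for illustration.

import pandas as pd

# Hypothetical input file with the columns extract_csv() expects.
pd.DataFrame({
    "Title": ["First doc", "Second doc"],
    "Text": ["body text one", "body text two"],
    "Tag": [1, 2],
}).to_csv("sample.csv", index=False)

docs = extract_csv("sample.csv")
print(len(docs), docs[0].text)  # Doc objects expose the text they were built from (see Example No. 11)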
Example No. 4
def extract_xml(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    docs = {}
    for page in root.findall(
            "{http://www.mediawiki.org/xml/export-0.10/}page"):
        id = int(
            page.find("{http://www.mediawiki.org/xml/export-0.10/}id").text)
        text = page.find("{http://www.mediawiki.org/xml/export-0.10/}revision") \
            .find("{http://www.mediawiki.org/xml/export-0.10/}text").text
        docs[id] = Doc(id, text)
    return docs
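The namespace URI is repeated for every lookup above; ElementTree also accepts a prefix-to-URI mapping, so the same traversal can be written with the namespace declared once. A sketch of that variant, behavior unchanged:

import xml.etree.ElementTree as ET

MW_NS = {"mw": "http://www.mediawiki.org/xml/export-0.10/"}

def extract_xml_ns(filename):
    # Same traversal as extract_xml(), with the namespace given once.
    root = ET.parse(filename).getroot()
    docs = {}
    for page in root.findall("mw:page", MW_NS):
        page_id = int(page.find("mw:id", MW_NS).text)
        text = page.find("mw:revision/mw:text", MW_NS).text
        docs[page_id] = Doc(page_id, text)
    return docs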
Example No. 5
def load_docs(docs_filename, numOfDoc, vocab, model):
    cnt = 0
    docs = [None] * numOfDoc
    len_sum = 0
    for line in open(docs_filename):
        doc = Doc(line, vocab)
        doc.init_varational_parameters(vocab, model)
        len_sum += len(doc)
        docs[cnt] = doc
        if cnt % 1000 == 0:
            print "progress:", cnt, "memoery useage:", resource.getrusage(
                resource.RUSAGE_SELF).ru_maxrss / 1000, "time:", datetime.now(
                )
        cnt += 1
        if cnt >= numOfDoc:
            break
    print "ave length of doc:", float(len_sum) / cnt
    return docs
Example No. 6
    def __init__(self, docs, gram):
        self.index = {}
        self.docs = docs
        self.gram = gram
        if self.gram == 2:
            all_words = unique([inner
                                for outer in docs
                                for inner in docs[outer].words])
            new_docs = {}
            for i, w in enumerate(all_words):
                word = "#" + w + "#"
                # overlapping character bigrams of the '#'-padded word
                words = [word[j:j + 2] for j in range(len(word) - 1)]
                doc = Doc(i, ' '.join(words))
                doc.words = words
                new_docs[i] = doc
            self.docs = new_docs

        self.create_index()
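To make the gram == 2 branch concrete: every vocabulary word is padded with '#' markers and split into overlapping character bigrams before being wrapped in a Doc. A small sketch of just that step:

def char_bigrams(word):
    # Pad the word with '#' markers and emit overlapping two-character grams,
    # exactly as the gram == 2 branch above does.
    padded = "#" + word + "#"
    return [padded[j:j + 2] for j in range(len(padded) - 1)]

print(char_bigrams("doc"))  # ['#d', 'do', 'oc', 'c#']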
Example No. 7
def fetch_and_create_doc(connection, name="Monaco"):
    print("=================================================================")
    print("Trying to fetch row and create doc")

    sql = "SELECT place_id, osm_id, osm_type, name, address, \
country_code, housenumber, postcode from placex \
where name->'name' like '" + name + "' limit 1 "
    cursor = connection.cursor(cursor_factory=RealDictCursor)
    cursor.execute(sql)
    record = cursor.fetchone()
    print(sql, "\n")

    # place_id, osm_id, osm_type, name, address, country_code, housenumber, \
    #     postcode = record.values()
    doc = Doc(record)

    print("osm_id:", doc.osm_id)
    print("osm_type:", doc.osm_type)
    print("name tags as dictionary:", doc.name)
    cursor.close()
    return doc
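A hedged usage sketch; the connection parameters are placeholders for a Nominatim-style PostgreSQL database that contains the placex table.

import psycopg2

# Placeholder credentials: point these at your own database with a placex table.
connection = psycopg2.connect(dbname="nominatim", user="postgres",
                              password="secret", host="localhost")
doc = fetch_and_create_doc(connection, name="Monaco")
print(doc.osm_id, doc.osm_type)
connection.close()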
Example No. 8
File: corpus.py  Project: bamine/LDA
def __init__(self, filename):
    self.docs = []
    self.vocab_size = 0
    self.n_docs = 0
    print "reading data from: " + filename
    f = open(filename)
    for line in f.readlines():
        parts = line.split()
        doc = Doc()
        for i, part in enumerate(parts):
            if i == 0:
                doc.length = int(part)
            else:
                word, count = map(int, part.split(":"))
                doc.words.append(word)
                doc.word_counts.append(count)
                doc.total += count
                if word >= self.vocab_size:
                    self.vocab_size = word + 1
        self.docs.append(doc)  # bug fix: the parsed doc was never stored, so n_docs was always 0
    f.close()
    self.n_docs = len(self.docs)
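A small sketch of the sparse line format this reader parses: the first field is the number of distinct words in the document, followed by word_id:count pairs (the LDA-C style layout). The file name and the assumption that the surrounding class is called Corpus are both made up for illustration.

# Hypothetical data file: <num_distinct_words> <word_id>:<count> <word_id>:<count> ...
with open("sample_corpus.dat", "w") as f:
    f.write("3 0:2 5:1 9:4\n")
    f.write("2 1:1 5:3\n")

corpus = Corpus("sample_corpus.dat")  # assumed class name for this __init__
print(corpus.n_docs)      # 2
print(corpus.vocab_size)  # 10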
Example No. 9
def banco():
    c1 = Cliente("Eduardo", 1200.00, 5000.00)
    c2 = Cliente("Carlos", 2000.00, 900.00)

    print("Withdrawal screens:")
    c1.Sacar(6200.00)
    print("Balance of client ", c1.getNome(), " is: ", c1.checarSaldo())
    c2.Sacar(100.0)
    print("Balance of client ", c2.getNome(), " is: ", c2.checarSaldo())
    c2.Sacar(3000.0)

    d1 = Doc()

    print(
        "------------------------ Docs screen: ------------------------------")
    print("Initial balance of client ", c1.getNome(), " is: ",
          c1.checarSaldo())
    print("Initial balance of client ", c2.getNome(), " is: ",
          c2.checarSaldo())
    print(
        "------------------------ Transfer 1: -------------------------------")
    d1.transferir(c1, c2, 300.00)
    print("Updated balance of client ", c1.getNome(), " is: ",
          c1.checarSaldo())
    print("Updated balance of client ", c2.getNome(), " is: ",
          c2.checarSaldo())
    print(
        "------------------------ Transfer 2: -------------------------------")
    d1.transferir(c1, c2, 30000.00)
    print("Updated balance of client ", c1.getNome(), " is: ",
          c1.checarSaldo())
    print("Updated balance of client ", c2.getNome(), " is: ",
          c2.checarSaldo())
    print(
        "------------------------ Transfer 3: -------------------------------")
    d1.transferir(c1, c2, 5900.00)
    print("Updated balance of client ", c1.getNome(), " is: ",
          c1.checarSaldo())
    print("Updated balance of client ", c2.getNome(), " is: ",
          c2.checarSaldo())
Example No. 10
def visualization():
    """ POST request, can be made via Postman.

    Notes:
        the format of the body of the POST request must be 'raw text'.

    Args:
        raw text: the body of the POST request should be raw text.
    Returns:
        String: Return a URL containing the reference number of this POST, the user
        can use the URL in the browser to see the visualized entity extract result
    """
    article = request.data.decode()  # String

    my_doc = Doc(article)
    reference_number = str(uuid.uuid4())

    html = displacy.render(my_doc.get_doc(), style="ent")
    catch.add_reference(reference_number, html)

    return jsonify({
        'your reference':
        f"http://{request.host}/get?reference={reference_number}"
    })
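A hedged client sketch for this endpoint; the route path /visualization and the localhost address are assumptions, since the route decorator is not shown.

import requests

article = "SpaceX launched another batch of satellites on Monday."
resp = requests.post("http://localhost:5000/visualization",  # hypothetical route
                     data=article.encode("utf-8"),
                     headers={"Content-Type": "text/plain"})
print(resp.json()["your reference"])  # open this URL in a browser to view the rendered entities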
Example No. 11
        docs = read_docs('../data/English.csv')
        preprocessor = EnglishPreprocessor(docs)
    else:
        docs = read_docs('../data/Persian.xml')
        preprocessor = PersianPreprocessor(docs)

    for doc in docs.values():
        doc.words = preprocessor.preprocess(doc.text)

    print("Preprocess is done!")

    index = PositionalIndexer(docs, 1).index
    print("Index Created Successfully!")

    query = input("Enter Query: ")
    q_doc = Doc(0, query)
    q_doc.words = preprocessor.preprocess(q_doc.text)

    query_tag = input("Enter Tag (1, 2, 3, 4, None): ")
    tag = None
    if query_tag in ["1", "2", "3", "4"]:
        tag = int(query_tag)

    if tag is not None:
        classify(docs)

    results = search(q_doc, docs, index, 10, query_tag)
    for result in results:
        print(result[1])
        print(result[0].text)
        print()
Example No. 12
    def parseCorpus(self, docToVerifiedSentences):

        # maps the long, original REF names to a small, more readable REF ID
        REFToUREF = {}
        UREF = 0

        print("* parsing ECB corpus:", self.args.corpusPath)
        numMentionsIgnored = 0
        corpus = Corpus()
        files = []

        filteredTrainingDirs = self.helper.trainingDirs[0:self.args.devDir]
        print("filteredTrainingDirs:", filteredTrainingDirs)
        for root, _, filenames in os.walk(self.args.corpusPath):
            for filename in fnmatch.filter(filenames, '*.xml'):
                f = os.path.join(root, filename)
                doc_id = f[f.rfind("/") + 1:]
                dir_num = int(doc_id.split("_")[0])
                if dir_num in self.helper.trainingDirs and dir_num not in filteredTrainingDirs:
                    continue
                files.append(os.path.join(root, filename))

        globalSentenceNum = 0
        lastToken_id = -1
        intraCount = 0
        
        # used for keeping track of how many mentions were pronouns
        had_pronoun = 0
        not_had_pronoun = 0
        num_events_with_pronouns = 0
        for f in sorted(files):
            lm_idToMention = {} # only used to tmp store the mentions
            removed_m_ids = set() # keeps track of the mentions that had pronouns and we removed (if we care to remove them)
            doc_id = f[f.rfind("/") + 1:]
            dir_num = int(doc_id.split("_")[0])
            extension = doc_id[doc_id.find("ecb"):]
            dirHalf = str(dir_num) + extension

            curDoc = Doc(doc_id)
            corpus.ECBDirs[dir_num].docs[doc_id] = curDoc
            corpus.dirHalves[dirHalf].docs[doc_id] = curDoc
            tmpDocTokens = []
            tmpDocTokenIDsToTokens = {}

            # opens the xml file and makes needed replacements
            input_file = open(f, 'r', encoding="utf-8")
            #with open(f, 'r', encoding="utf-8") as myfile:
            fileContents = input_file.read().replace('\n', ' ')            
            for badToken in self.replacementsList:  # self.replacementsSet:
                fileContents = fileContents.replace(badToken, self.replacements[badToken])

            # reads <tokens>
            it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\" number=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
            lastSentenceNum = -1

            if self.write_stanford_input:
                tmp_line_to_stanford_input = defaultdict(list)

            # numbers every token in each given sentence, starting at 1 (each sentence starts at 1)
            tokenNum = 0
            firstToken = True
            lastTokenText = ""

            for match in it:
                t_id = match.group(1)
                sentenceNum = int(match.group(2))
                hTokenNum = int(match.group(3))  # only used for matching w/ HDDCRP's files
                
                # tokenText = match.group(4).rstrip()  # use this instead when writing the corpus out for Stanford
                tokenText = match.group(4).lower().rstrip()
                # removes tokens that end in : (e.g., newspaper:) but leaves the atomic ":" alone
                if len(tokenText) > 1 and tokenText[-1] == ":":
                    tokenText = tokenText[:-1]
                if tokenText == "''":
                    tokenText = "\""
                elif tokenText == "''bagman\"":
                    tokenText = "\"bagman\""
                    print("* replaced bagman1")
                elif tokenText == "''bagman":
                    tokenText = "\"bagman"
                    print("* replaced bagman2")
    
                if sentenceNum > curDoc.highestSentenceNum:
                    curDoc.highestSentenceNum = sentenceNum
                
                if sentenceNum > 0 or "plus" not in doc_id:
                    
                    # writes Stanford_input
                    if self.write_stanford_input:
                        tmp_line_to_stanford_input[int(sentenceNum)].append(match.group(4).rstrip())

                    hSentenceNum = sentenceNum
                    if "plus" in doc_id:
                        hSentenceNum = sentenceNum - 1

                    # TMP
                    '''
                    if sentenceNum not in tmpSentenceNums:
                        tmpSentenceNums.append(sentenceNum)
                    '''

                    # we are starting a new sentence
                    if sentenceNum != lastSentenceNum:
                        # we are possibly ending the prev sentence
                        if not firstToken:
                            # if sentence ended with an atomic ":", let's change it to a "."
                            if lastTokenText == ":":
                                lastToken = tmpDocTokenIDsToTokens[lastToken_id]
                                lastToken.text = "."
                                tmpDocTokenIDsToTokens[lastToken_id] = lastToken
                            elif lastTokenText not in self.endPunctuation:
                                endToken = Token("-1", lastSentenceNum, globalSentenceNum, tokenNum, doc_id, hSentenceNum, hTokenNum, ".")
                                tmpDocTokens.append(endToken)

                            globalSentenceNum = globalSentenceNum + 1

                        tokenNum = 0
                    # adds token
                    curToken = Token(t_id, sentenceNum, globalSentenceNum, tokenNum, doc_id, hSentenceNum, hTokenNum, tokenText)
                    #corpus.UIDToToken[curToken.UID] = curToken
                    #curDoc.UIDs.append(curToken.UID)
                    tmpDocTokenIDsToTokens[t_id] = curToken

                    firstToken = False
                    tmpDocTokens.append(curToken)
                    tokenNum = tokenNum + 1
                    curDoc.globalSentenceNums.add(globalSentenceNum)
                lastSentenceNum = sentenceNum
                lastTokenText = tokenText
                lastToken_id = t_id

            if self.write_stanford_input:
                tmpFOUT = open("../data/stanford_in/"+doc_id, "w")
                for sent_num in sorted(tmp_line_to_stanford_input.keys()):
                    tmpFOUT.write(" ".join(tmp_line_to_stanford_input[sent_num]) + "\n")
                tmpFOUT.close()

            # if sentence ended with an atomic ":", let's change it to a "."
            if lastTokenText == ":":
                lastToken = tmpDocTokenIDsToTokens[lastToken_id]
                lastToken.text = "."
                tmpDocTokenIDsToTokens[lastToken_id] = lastToken
            elif lastTokenText not in self.endPunctuation:
                endToken = Token("-1", lastSentenceNum, globalSentenceNum, tokenNum, doc_id, -1, -1, ".")
                tmpDocTokens.append(endToken)

            globalSentenceNum = globalSentenceNum + 1

            # reads <markables> 1st time
            regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
            markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
            it = tuple(re.finditer(regex, markables))
            for match in it:
                # gets the token IDs
                regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
                it2 = tuple(re.finditer(regex2, match.group(3)))
                tmpCurrentMentionSpanIDs = []
                hasAllTokens = True
                for match2 in it2:
                    tokenID = match2.group(1)
                    tmpCurrentMentionSpanIDs.append(int(tokenID))
                    if tokenID not in tmpDocTokenIDsToTokens.keys():
                        hasAllTokens = False

            for t in tmpDocTokens:
                corpus.addToken(t)
                curDoc.tokens.append(t)
                corpus.UIDToToken[t.UID] = t

                #if doc_id == "31_3ecbplus.xml":
                #    print("t:",t)
                
            # reads <markables> 2nd time
            regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
            markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
            it = tuple(re.finditer(regex, markables))
            for match in it:
                isPred = False
                mentionType = match.group(1)
                if "ACTION" in mentionType:
                    isPred = True
                m_id = int(match.group(2))

                # gets the token IDs
                regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
                it2 = tuple(re.finditer(regex2, match.group(3)))
                tmpTokens = []
                text = []
                hasAllTokens = True

                has_pronoun = False
                for match2 in it2:
                    tokenID = match2.group(1)
                    if tokenID in tmpDocTokenIDsToTokens.keys():
                        cur_token = tmpDocTokenIDsToTokens[tokenID]
                        tmpTokens.append(cur_token)
                        text.append(cur_token.text)

                    else:
                        hasAllTokens = False

                # only process Mentions if they adhere to our preferences of using pronouns or not
                # determines if it has a pronoun or not (and if we care)
                if len(text) == 1:
                    if text[0] in self.helper.pronouns:
                        has_pronoun = True

                if has_pronoun:
                    had_pronoun += 1
                    if isPred:
                        num_events_with_pronouns += 1
                else:
                    not_had_pronoun += 1

                # possibly add the mention 
                use_pronoun = False
                if isPred:
                    use_pronoun = self.helper.event_pronouns
                else:
                    use_pronoun = self.helper.entity_pronouns
                
                use_mention = True
                if not use_pronoun and has_pronoun:
                    use_mention = False
                    #print("* not constructing mention:", text)
                    removed_m_ids.add(m_id)

                # incomplete Mentions should only occur in our hand-curated sample corpus,
                # where we intentionally drop some mentions by curtailing the sentences of tokens
                if hasAllTokens and use_mention:
                    curMention = Mention(dirHalf, dir_num, doc_id, tmpTokens, text, isPred, mentionType)
                    lm_idToMention[m_id] = curMention
                    #tmpSentenceNumToMentions[tmpTokens[0].sentenceNum].append(curMention)
                    #corpus.addMention(curMention, "123")
            # reads <relations>
            relations = fileContents[fileContents.find("<Relations>"):fileContents.find("</Relations>")]
            regex = r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)?</.*?>"
            it = tuple(re.finditer(regex, relations))
            for match in it:
                REF = match.group(1)
                regex2 = r"<source m_id=\"(\d+)\".*?/>"
                it2 = tuple(re.finditer(regex2, match.group(2)))
                # only keep track of REFs for which we have found Mentions
                for match2 in it2:
                    m_id = int(match2.group(1))
                    if m_id not in lm_idToMention:
                        
                        if  m_id not in removed_m_ids:
                            print("*** MISSING MENTION! EXITING 1")
                            exit(1)
                    else: #elif lm_idToMention[m_id].isPred:
                        foundMention = lm_idToMention[m_id]
                        if self.onlyEvents and not foundMention.isPred:
                            continue
                        token0 = foundMention.tokens[0]

                        if self.args.onlyValidSentences and token0.sentenceNum not in docToVerifiedSentences[doc_id]:
                            numMentionsIgnored += 1
                            continue
                        else:
                            corpus.addMention(foundMention, REF)

            if self.args.addIntraDocs:
                regex = r"<INTRA_DOC_COREF.*?>(.*?)?</.*?>"
                it = tuple(re.finditer(regex, relations))
                for match in it:
                    regex2 = r"<source m_id=\"(\d+)\".*?/>"
                    it2 = tuple(re.finditer(regex2, match.group(1)))
                    # only keep track of REFs for which we have found Mentions
                    for match2 in it2:
                        m_id = int(match2.group(1))
                        if m_id not in lm_idToMention:
                            print("*** MISSING MENTION! EXITING 2")
                            exit(1)
                        else:
                            foundMention = lm_idToMention[m_id]
                            if self.onlyEvents and not foundMention.isPred:
                                continue
                            token0 = foundMention.tokens[0]

                            if self.args.onlyValidSentences and token0.sentenceNum not in docToVerifiedSentences[doc_id]:
                                numMentionsIgnored += 1
                                continue
                            else:
                                corpus.addMention(foundMention, "INTRA"+str(intraCount))
                                intraCount += 1
            corpus.addDocPointer(doc_id, curDoc)

            # optionally displays annotations (Mentions clearly designated w/ unique REF IDs#)
            if self.printCorpusTokens:
                print("\n------------------\ndoc:",doc_id,"\n------------------")
                sent_num = -1
                oline = ""
                lastMentions = set()
                for t in curDoc.tokens:
                    if t.sentenceNum != sent_num and sent_num != -1:
                        sent_num = t.sentenceNum
                        print(oline)
                        oline = ""
                    added = False
                    removed = False
                    urefToAdd = -1
                    entOrEventToAdd = ""
                    for m in t.mentions:
                        if m not in lastMentions:
                            if m.REF in REFToUREF.keys():
                                urefToAdd = REFToUREF[m.REF]
                            else:
                                urefToAdd = UREF
                                REFToUREF[m.REF] = UREF
                                UREF += 1
                            if m.isPred:
                                entOrEventToAdd = "v"
                            else:
                                entOrEventToAdd = "ent"
                            added = True
                    
                    if len(lastMentions) > 0:
                        for m in lastMentions:
                            if m not in t.mentions:
                                removed = True
                    if removed:
                        oline += "] "
                    if added:
                        if len(oline) > 0 and oline[-1] != " ":
                            oline += " "
                        oline += str(entOrEventToAdd) + str(urefToAdd) + "["
                    if len(oline) > 0 and oline[-1] != " " and oline[-1] != "[":
                        oline += " "
                    oline += str(t.text)
                    lastMentions = t.mentions
                print(oline)
        corpus.assignGlobalSentenceNums()
        print("numMentionsIgnored:", numMentionsIgnored)
        print("# ECB mentions created:", len(corpus.ecb_mentions))
        num_events = 0
        for m in corpus.ecb_mentions:
            if m.isPred:
                num_events += 1
        print("\t# events:", num_events)
        print("\t\t# of event which had pronouns:", num_events_with_pronouns)
        print("\t# entities:", len(corpus.ecb_mentions) - num_events)
        print("# ECB+ tokens:", len(corpus.corpusTokens))
        print("# mentions that had_pronoun:", had_pronoun)
        print("# mentions that did not had_pronoun:", not_had_pronoun)

        return corpus
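To illustrate the <token> regex used when reading <tokens> above, a minimal sketch applying it to one synthetic ECB+-style token element (the sample string is made up for illustration):

import re

token_re = r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\" number=\"(\d+)\".*?>(.*?)</(.*?)>"
sample = '<token t_id="7" sentence="1" number="3">Obama</token>'
m = re.search(token_re, sample)
print(m.group(1), m.group(2), m.group(3), m.group(4))  # 7 1 3 Obama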