Example #1
import os

# CONCEPT_FOLDER_BASE, IR_CORPUS, preprocessText and load_document are
# assumed to be defined elsewhere in the module.
def extractConceptst(outputfile, listbooks_concepts, preprocessed_concepts=None,
                     notprocessed_concepts=None, topcount=1):
    # None defaults avoid the mutable-default-argument pitfall.
    preprocessed_concepts = preprocessed_concepts or []
    notprocessed_concepts = notprocessed_concepts or []

    l_concepts = {}
    # Keep the top-`topcount` concepts per file from the preprocessed folders.
    for pconceptfolders in preprocessed_concepts:
        for file in os.listdir(CONCEPT_FOLDER_BASE + pconceptfolders):
            if file.endswith("phrases") and file.startswith(tuple(listbooks_concepts)):
                fnamek = file.replace(".txt.phrases", "")
                if fnamek not in l_concepts:
                    l_concepts[fnamek] = set()
                iCount = 0
                with open(CONCEPT_FOLDER_BASE + pconceptfolders + "/" + file, 'r') as fin:
                    for line in fin:
                        if len(line.strip()) > 1:
                            concept = line.split(",")[0].replace(" ", "_")
                            tfidf = float(line.split(",")[1].strip())  # parsed but unused; rank order decides
                            if iCount < topcount:
                                iCount += 1
                                l_concepts[fnamek].add(concept)

    # Concepts from the unprocessed folders are normalised with preprocessText.
    for notpconceptfolder in notprocessed_concepts:
        for file in os.listdir(CONCEPT_FOLDER_BASE + notpconceptfolder):
            if file.endswith("phrases") and file.startswith(tuple(listbooks_concepts)):
                fnamek = file.replace(".txt.phrases", "")
                if fnamek not in l_concepts:
                    l_concepts[fnamek] = set()
                with open(CONCEPT_FOLDER_BASE + notpconceptfolder + "/" + file, 'r') as fin:
                    for line in fin:
                        pconcept = preprocessText(line.split(",")[0])
                        l_concepts[fnamek].add('_'.join(pconcept))

    documents = load_document(IR_CORPUS, listbooks_concepts)

    # Write, per document, an "<id> <concepts>" line followed by the document text.
    lConcept_len = []
    with open(outputfile, 'w') as fcsv:
        for doc in documents:
            if doc.id not in l_concepts:
                l_concepts[doc.id] = set()
            lConcept_len.append(len(l_concepts[doc.id]))
            fcsv.write(doc.id.replace(" ", "_") + " " +
                       ' '.join(l_concepts[doc.id]).replace("\n", "").replace("\t", "") + "\n")
            fcsv.write(' '.join(preprocessText(doc.text, stemming=False,
                                               stopwords_removal=False)) + "\n")

    return l_concepts, lConcept_len
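
A minimal call sketch for the function above. The folder names, book prefixes and output path are placeholders, not values from the original project, and CONCEPT_FOLDER_BASE plus the helper functions must already be set up as the snippet assumes.

# Hypothetical invocation; folder names and prefixes are placeholders.
l_concepts, lConcept_len = extractConceptst(
    outputfile='doc2tag.csv',
    listbooks_concepts=['irv-', 'sigir-'],
    preprocessed_concepts=['processed_phrases'],
    notprocessed_concepts=['raw_phrases'],
    topcount=5)
print(len(lConcept_len), "documents written,",
      sum(lConcept_len), "concept slots in total")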
Example #2
import nltk

# Assumes the module-level names l_concepts (known concept vocabulary),
# conceptdocs (concept -> context tokens), conceptcategory (concept ->
# categories), stem() and preprocessText() are defined elsewhere.
def filltokendict(document, category=None):
    doc = document.text
    tokens = preprocessText(doc, stemming=False, stopwords_removal=True)
    doc_concepts = set()

    # Unigram concepts: slide a 6-token window; when the first (or last)
    # token is a known concept, record the other 5 tokens as its context.
    for ngram in nltk.ngrams(tokens, n=6):
        token = stem(ngram[0])
        if token in l_concepts:
            if token in conceptdocs:
                conceptdocs[token] += ngram[1:]
            else:
                conceptdocs[token] = list(ngram[1:])
            doc_concepts.add(token)

        token = stem(ngram[5])
        if token in l_concepts:
            if token in conceptdocs:
                conceptdocs[token] += ngram[:5]
            else:
                conceptdocs[token] = list(ngram[:5])
            doc_concepts.add(token)

    # Bigram concepts: a 7-token window again leaves 5 context tokens.
    for ngram in nltk.ngrams(tokens, n=7):
        token = ' '.join([stem(ngram[5]), stem(ngram[6])])
        if token in l_concepts:
            if token in conceptdocs:
                conceptdocs[token] += ngram[:5]
            else:
                conceptdocs[token] = list(ngram[:5])
            doc_concepts.add(token)

        token = ' '.join([stem(ngram[0]), stem(ngram[1])])
        if token in l_concepts:
            if token in conceptdocs:
                conceptdocs[token] += ngram[2:]
            else:
                conceptdocs[token] = list(ngram[2:])
            doc_concepts.add(token)

    # Trigram concepts: an 8-token window, same 5-token context.
    for ngram in nltk.ngrams(tokens, n=8):
        token = ' '.join([stem(ngram[5]), stem(ngram[6]), stem(ngram[7])])
        if token in l_concepts:
            if token in conceptdocs:
                conceptdocs[token] += ngram[:5]
            else:
                conceptdocs[token] = list(ngram[:5])
            doc_concepts.add(token)

        token = ' '.join([stem(ngram[0]), stem(ngram[1]), stem(ngram[2])])
        if token in l_concepts:
            if token in conceptdocs:
                conceptdocs[token] += ngram[3:]
            else:
                conceptdocs[token] = list(ngram[3:])
            doc_concepts.add(token)

    # Track which categories each concept was seen in.
    if category is not None and category != "":
        for concept in doc_concepts:
            if concept in conceptcategory:
                conceptcategory[concept].append(category)
            else:
                conceptcategory[concept] = [category]
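
A toy driver for filltokendict. Everything below is a hypothetical stand-in: the real project supplies its own preprocessText, stem, document type and concept vocabulary, which are not shown in the snippet.

from collections import namedtuple

Document = namedtuple('Document', ['id', 'text'])

def preprocessText(text, stemming=False, stopwords_removal=True):
    return text.lower().split()   # simplified tokeniser stand-in

def stem(w):
    return w                      # identity stand-in for the real stemmer

l_concepts = {'index', 'inverted index'}  # toy concept vocabulary
conceptdocs, conceptcategory = {}, {}

doc = Document('irv-ch1', 'Modern retrieval systems build an inverted '
                          'index over the document collection quickly')
filltokendict(doc, category='indexing')
print(sorted(conceptdocs))        # ['index', 'inverted index']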
Example #3
import os

# Script fragment: CONCEPT_FOLDER_BASE, preprocessed_concepts,
# notprocessed_concepts, listbooks_concepts, preprocessText and
# conceptcategories are assumed to be defined earlier in the script.
l_concepts = set()
for pconceptfolders in preprocessed_concepts:
    for file in os.listdir(CONCEPT_FOLDER_BASE + pconceptfolders):
        if file.endswith("phrases") and file.startswith(tuple(listbooks_concepts)):
            with open(CONCEPT_FOLDER_BASE + pconceptfolders + "/" + file, 'r') as fin:
                for line in fin:
                    concept = line.split(",")[0]
                    l_concepts.add(concept)

for notpconceptfolder in notprocessed_concepts:
    for file in os.listdir(CONCEPT_FOLDER_BASE + notpconceptfolder):
        if file.endswith("phrases") and file.startswith(tuple(listbooks_concepts)):
            # Open the stemmed output only for matching inputs, so empty
            # files are not created in the "_stem" folder.
            with open(CONCEPT_FOLDER_BASE + notpconceptfolder + "_stem/" + file, 'w') as fwrite, \
                 open(CONCEPT_FOLDER_BASE + notpconceptfolder + "/" + file, 'r') as fin:
                for line in fin:
                    pconcept = preprocessText(line.split(",")[0])
                    fwrite.write(' '.join(pconcept) + "\n")
                    l_concepts.add(' '.join(pconcept))

# From files: extra concepts loaded from plain text lists.
fconcepts = ['data/file2.txt', 'data/file3.txt']
for cfile in fconcepts:
    with open(cfile) as fin:
        for line in fin:
            l_concepts.add(line.strip())

category_file = 'data/chapterwise_title.csv'
chapterwise_titledict = conceptcategories(category_file)

listbooks = ['irv-', 'issr-', 'foa-', 'sigir-', 'zhai-', 'seirip-', 'wiki-']
# listbooks = ['sigir']
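
The conceptcategories helper is not shown in this snippet. A plausible sketch, assuming each row of chapterwise_title.csv holds a chapter identifier and its title in two comma-separated columns (the column layout is an assumption, not confirmed by the source):

import csv

def conceptcategories(category_file):
    # Hypothetical reconstruction: assumes rows of "<chapter id>,<title>".
    titledict = {}
    with open(category_file, newline='') as fin:
        for row in csv.reader(fin):
            if len(row) >= 2:
                titledict[row[0].strip()] = row[1].strip()
    return titledict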
Example #4
import os

# Script fragment: CONCEPT_FOLDER_BASE, preprocessed_concepts,
# notprocessed_concepts, listbooks_concepts, preprocessText and
# load_document are assumed to be defined earlier in the script.
l_concepts = {}
for pconceptfolders in preprocessed_concepts:
    for file in os.listdir(CONCEPT_FOLDER_BASE + pconceptfolders):
        if file.endswith("phrases") and file.startswith(tuple(listbooks_concepts)):
            fnamek = file.replace(".txt.phrases", "")
            if fnamek not in l_concepts:
                l_concepts[fnamek] = set()
            with open(CONCEPT_FOLDER_BASE + pconceptfolders + "/" + file, 'r') as fin:
                for line in fin:
                    concept = line.split(",")[0].replace(" ", "_")
                    l_concepts[fnamek].add(concept)

for notpconceptfolder in notprocessed_concepts:
    for file in os.listdir(CONCEPT_FOLDER_BASE + notpconceptfolder):
        if file.endswith("phrases") and file.startswith(tuple(listbooks_concepts)):
            fnamek = file.replace(".txt.phrases", "")
            if fnamek not in l_concepts:
                l_concepts[fnamek] = set()
            with open(CONCEPT_FOLDER_BASE + notpconceptfolder + "/" + file, 'r') as fin:
                for line in fin:
                    pconcept = preprocessText(line.split(",")[0])
                    l_concepts[fnamek].add('_'.join(pconcept))

IR_CORPUS = 'data/keyphrase/textbook/all_text.csv'
documents = load_document(IR_CORPUS, listbooks_concepts)

# Write one "<doc id> <concept> <concept> ..." line per document that has
# extracted concepts.
outputfile = 'doc2tagtrain_nostopwords_nostem.csv'
with open(outputfile, 'w') as fcsv:
    for doc in documents:
        if doc.id in l_concepts:
            fcsv.write(
                doc.id.replace(" ", "_") + " " +
                ' '.join(l_concepts[doc.id]).replace("\n", "").replace("\t", "") +
                "\n")
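
Each output line pairs a document id with its space-separated concept tags, so the file can be read back into a dict. A small sketch, assuming the format written above:

# Read the "<doc id> <tag> <tag> ..." lines back into a dict (sketch).
doc2tags = {}
with open('doc2tagtrain_nostopwords_nostem.csv') as fin:
    for line in fin:
        parts = line.split()
        if parts:
            doc2tags[parts[0]] = parts[1:]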