def createLemmaText(self):
    """Lemmatize self.rawText into self.words and self.cleanText.

    Lowercases the raw text, removes stop words and punctuation, runs the
    pattern parser (EN or FR) with lemmata enabled, then stores one
    (lemma, POS-prefix) tuple per token in self.words and rebuilds
    self.cleanText from the collected lemmas.
    """
    ct = CleanText()
    text = self.rawText
    text = text.lower()
    # Stop-word removal is language dependent ('EN' / 'FR').
    text = ct.removeStopWords(text, self.language)
    text = ct.removePunctuation(text)
    if self.language == 'EN':
        text = parseEN(text, tags = False, chunks = False, lemmata=True).split()
    elif self.language == 'FR':
        text = parseFR(text, tags = False, chunks = False, lemmata=True).split()
    # NOTE(review): if self.language is neither 'EN' nor 'FR', `text` stays a
    # plain string and text[0] below is its first character -- confirm callers
    # only pass these two language codes.
    try:
        if text:
            # Only text[0] -- the first parsed sentence -- is consumed;
            # confirm truncation of multi-sentence input is intended.
            for word in text[0]:
                # word[2] appears to hold the lemma and word[1][:2] a
                # two-letter POS prefix -- TODO confirm against the
                # pattern parse() token layout.
                self.words.append((word[2].lower(), word[1][:2]))
            self.cleanText = ' '.join(word[0] for word in self.words)
    except Exception as e:
        # Python 2 print statement: best-effort logging of the failure
        # together with the offending raw text.
        print e, self.rawText
def createLemmaText(self):
    """Build self.words / self.cleanText from the lemmatized raw text.

    Pipeline: lowercase -> strip stop words -> strip punctuation ->
    pattern parse with lemmata=True -> collect (lemma, POS-prefix) pairs.
    """
    ct = CleanText()
    text = self.rawText
    text = text.lower()
    # Language-specific stop-word list ('EN' or 'FR').
    text = ct.removeStopWords(text, self.language)
    text = ct.removePunctuation(text)
    if self.language == 'EN':
        text = parseEN(text, tags=False, chunks=False, lemmata=True).split()
    elif self.language == 'FR':
        text = parseFR(text, tags=False, chunks=False, lemmata=True).split()
    try:
        if text:
            # NOTE(review): only the first parsed sentence (text[0]) is
            # iterated; later sentences are dropped -- verify intent.
            for word in text[0]:
                # Presumably word[2] is the lemma and word[1][:2] the POS
                # tag prefix -- confirm against pattern's parse() output.
                self.words.append((word[2].lower(), word[1][:2]))
            self.cleanText = ' '.join(word[0] for word in self.words)
    except Exception as e:
        # Python 2 print statement; swallows the error after logging it.
        print e, self.rawText
__copyright__ = "Copyright 2015, University Politehnica of Bucharest" __license__ = "GNU GPL" __version__ = "0.1" __email__ = "*****@*****.**" __status__ = "Production" import pymongo from gensim.utils import lemmatize from nlplib.clean_text import CleanText from concurrent.futures import ThreadPoolExecutor from multiprocessing import cpu_count import time client = pymongo.MongoClient() db = client['Tectoniq'] cleanText = CleanText() function = """function(){ var items = db.search_index2.find().addOption(DBQuery.Option.noTimeout); while(items.hasNext()){ var item = items.next(); doc = {word: item._id, docIDs: item.value.docIDs}; db.search_index.insert(doc); } }""" mapFunction = """function() { var key = this.word; for (var idx=0; idx<this.docIDs.length; idx++){ var tfidf = this.idf * this.docIDs[idx].tf; value = { 'docID': this.docIDs[idx].docID, 'TFIDF': tfidf };
def __init__(self, dbname):
    """Connect to the Mongo database *dbname* and reset parser state."""
    # Start with no parsed documents and a fresh text-cleaning helper.
    self.documents = []
    self.ct = CleanText()
    # Remember the database name and open the shared Mongo connection.
    self.dbname = dbname
    self.client = MongoClient()
    self.db = self.client[dbname]
class Parser(object):
    """Parse the project's three XML data sets into MongoDB documents.

    ``images``, ``inventories`` and ``vdn`` each read a fixed directory of
    XML files and append plain dicts to ``self.documents``; ``insert`` then
    bulk-writes the accumulated documents and rebuilds the vocabulary index.
    """

    def __init__(self, dbname):
        # Target database name, accumulated documents, Mongo handle and
        # the shared text-cleaning helper.
        self.dbname = dbname
        self.documents = []
        self.client = MongoClient()
        self.db = self.client[self.dbname]
        self.ct = CleanText()

    def images(self):
        """Parse the IRHIS_BaseImages XML files into document dicts."""
        directory = '../../DATA_SETS/IRHIS_BaseImages/'
        # Only plain files ending in .xml are considered.
        xml_files = [
            f for f in listdir(directory)
            if isfile(join(directory, f)) and f.endswith('.xml')
        ]
        for filename in xml_files:
            tree = ET.parse(join(directory, filename))
            root = tree.getroot()
            document = {}
            if root.attrib.get("record_id"):
                document["record_id"] = root.attrib.get("record_id").strip()
            for child in root.findall("description"):
                # cleanText() returns a sequence whose first element is the
                # cleaned French string.
                document["title"] = self.ct.cleanText(
                    child.find("TitreEnregistrement").text, "FR")[0]
                document["description"] = self.ct.cleanText(
                    child.find("Description").text, "FR")[0]
                document["epoch"] = [child.find("EpoqueEvenement").text]
                document["photo"] = child.find("CodePhoto").text
                # The remaining fields are optional in the XML schema.
                if child.find("ReferenceBibliographique") is not None:
                    document["reference"] = child.find(
                        "ReferenceBibliographique").text
                if child.find("ProvenanceDocument") is not None:
                    document["source"] = child.find(
                        "ProvenanceDocument").text
                if child.find("EtablissementDepositaire") is not None:
                    document["source_location"] = child.find(
                        "EtablissementDepositaire").text
                if child.find("AnneeEvenement") is not None:
                    document["date"] = child.find("AnneeEvenement").text
                words = self.getWords(document["description"])
                if words:
                    document['words'] = words
                # De-duplicate analytic and geographic keywords via sets.
                l = set()
                for a in child.findall("MotsClefsAnalytiques"):
                    l.add(a.text)
                document["keywords"] = list(l)
                l = set()
                for a in child.findall("MotsClefsGeographiques"):
                    l.add(a.text)
                document["location"] = list(l)
            self.documents.append(document)

    def insert(self):
        """Store the parsed documents and rebuild the vocabulary index."""
        # NOTE(review): drops the singular 'document' collection but inserts
        # into 'documents' -- confirm this mismatch is intentional.
        self.db.document.drop()
        if self.documents:
            self.db.documents.insert(self.documents)
        vocab = VI(self.dbname)
        vocab.createIndex()

    def getWords(self, text):
        """Lemmatize *text* (French) and return a list of word-stat dicts."""
        lemmas = LemmatizeText(self.ct.removePunctuation(text), "FR")
        lemmas.createLemmaText()
        lemmaText = lemmas.cleanText
        words = []
        # Skip empty / single-space lemma text.
        if lemmaText and lemmaText != " ":
            lemmas.createLemmas()
            for w in lemmas.wordList:
                # One dict per lemma: surface form, term frequency,
                # raw count and part-of-speech tag.
                word = {}
                word['word'] = w.word
                word['tf'] = w.tf
                word['count'] = w.count
                word['pos'] = w.wtype
                words.append(word)
        return words

    def inventories(self):
        """Parse the ServiceInventaire XML files into document dicts."""
        directory = '../../DATA_SETS/ServiceInventaire/'
        xml_files = [
            f for f in listdir(directory)
            if isfile(join(directory, f)) and f.endswith('.xml')
        ]
        for filename in xml_files:
            tree = ET.parse(join(directory, filename))
            root = tree.getroot()
            document = {}
            if root.attrib.get("reference"):
                document["record_id"] = root.attrib.get("reference").strip()
            document["title"] = self.ct.cleanText(
                root.find("edifice").text, "FR")[0]
            # NOTE(review): 'historique' is cleaned twice and the second copy
            # re-joined on ';' -- looks redundant; confirm intent.
            document["description"] = self.ct.cleanText(
                root.find("historique").text, "FR")[0] + ' ' + ' '.join(
                    self.ct.cleanText(root.find("historique").text,
                                      "FR")[0].split(';'))
            document["keywords"] = []
            # Keyword sources are ';'-separated lists, each optional.
            if root.find("denomination").text is not None:
                document["keywords"] += root.find(
                    "denomination").text.split(";")
            if root.find("grosOeuvres").text is not None:
                document["keywords"] += root.find(
                    "grosOeuvres").text.split(";")
            if root.find("materiauxCouverture").text is not None:
                document["keywords"] += root.find(
                    "materiauxCouverture").text.split(";")
            if root.find("couvrement").text is not None:
                document["keywords"] += root.find("couvrement").text.split(
                    ";")
            document["epoch"] = root.find("epoqueConstruction").text.split(
                ";")
            document["location"] = root.find("localisation").text.split(
                ";")
            words = self.getWords(document["description"])
            if words:
                document['words'] = words
            self.documents.append(document)

    def vdn(self):
        """Parse the LaVoixDuNord article XML files into document dicts."""
        directory = '../../DATA_SETS/LaVoixDuNord/'
        xml_files = [
            f for f in listdir(directory)
            if isfile(join(directory, f)) and f.endswith('.xml')
        ]
        for filename in xml_files:
            tree = ET.parse(join(directory, filename))
            root = tree.getroot()
            for child in root.findall('DOCUMENT'):
                # Record ids are namespaced with a 'document_' prefix.
                key = 'document_' + child.attrib.get("id")
                document = {}
                document['record_id'] = key
                document['source'] = child.find('DESCRIPTION').find(
                    'SOURCE').text
                document['author'] = child.find('DESCRIPTION').find(
                    'AUTEUR').text
                document['source_location'] = child.find('DESCRIPTION').find(
                    'REFERENCE').text
                document['date'] = child.find('DESCRIPTION').find('DATE').text
                document['title'] = self.ct.cleanText(
                    child.find('DESCRIPTION').find('TITRE').text, "FR")[0]
                document['description'] = self.ct.cleanText(
                    child.find('TEXTE').text, "FR")[0]
                words = self.getWords(document["description"])
                if words:
                    document['words'] = words
                self.documents.append(document)
#authors authors = [] for a in elem[5].split(','): author = dict() names = a.split(' ') author['firstname'] = ' '.join(names[:-1]) author['lastname'] = names[-1] authors.append(author) document['authors'] = authors document['source'] = elem[6] document['words'] = words except Exception as e: print e return document """ ct = CleanText() def insert_data(dbname, corpus, remove=False): client = pymongo.MongoClient() db = client[dbname] if remove: db.documents.remove({}) documents = [] no_threads = cpu_count() with ProcessPoolExecutor(max_workers=no_threads) as worker: for result in worker.map(process_element, corpus): if result: documents.append(result) if documents: print len(documents) try: db.documents.insert(documents, continue_on_error=True)
class Parser(object):
    """XML-to-MongoDB corpus parser for the three project data sets.

    The ``images``/``inventories``/``vdn`` readers accumulate plain dicts in
    ``self.documents``; ``insert`` writes them out and rebuilds the index.
    """

    def __init__(self, dbname):
        # Database name, parsed-document buffer, Mongo connection and the
        # shared text cleaner used by every reader.
        self.dbname = dbname
        self.documents = []
        self.client = MongoClient()
        self.db = self.client[self.dbname]
        self.ct = CleanText()

    def images(self):
        """Read the IRHIS_BaseImages directory of .xml files."""
        directory = '../../DATA_SETS/IRHIS_BaseImages/'
        xml_files = [f for f in listdir(directory)
                     if isfile(join(directory, f)) and f.endswith('.xml')]
        for filename in xml_files:
            tree = ET.parse(join(directory, filename))
            root = tree.getroot()
            document = {}
            if root.attrib.get("record_id"):
                document["record_id"] = root.attrib.get("record_id").strip()
            for child in root.findall("description"):
                # cleanText() returns a sequence; element 0 is the cleaned
                # French string.
                document["title"] = self.ct.cleanText(
                    child.find("TitreEnregistrement").text, "FR")[0]
                document["description"] = self.ct.cleanText(
                    child.find("Description").text, "FR")[0]
                document["epoch"] = [child.find("EpoqueEvenement").text]
                document["photo"] = child.find("CodePhoto").text
                # Optional schema fields.
                if child.find("ReferenceBibliographique") is not None:
                    document["reference"] = child.find(
                        "ReferenceBibliographique").text
                if child.find("ProvenanceDocument") is not None:
                    document["source"] = child.find("ProvenanceDocument").text
                if child.find("EtablissementDepositaire") is not None:
                    document["source_location"] = child.find(
                        "EtablissementDepositaire").text
                if child.find("AnneeEvenement") is not None:
                    document["date"] = child.find("AnneeEvenement").text
                words = self.getWords(document["description"])
                if words:
                    document['words'] = words
                # Sets de-duplicate the two keyword categories.
                l = set()
                for a in child.findall("MotsClefsAnalytiques"):
                    l.add(a.text)
                document["keywords"] = list(l)
                l = set()
                for a in child.findall("MotsClefsGeographiques"):
                    l.add(a.text)
                document["location"] = list(l)
            self.documents.append(document)

    def insert(self):
        """Persist parsed documents and regenerate the vocabulary index."""
        # NOTE(review): 'document' (singular) is dropped while the insert
        # targets 'documents' -- verify the collection names.
        self.db.document.drop()
        if self.documents:
            self.db.documents.insert(self.documents)
        vocab = VI(self.dbname)
        vocab.createIndex()

    def getWords(self, text):
        """Return word-statistics dicts for the French lemmas of *text*."""
        lemmas = LemmatizeText(self.ct.removePunctuation(text), "FR")
        lemmas.createLemmaText()
        lemmaText = lemmas.cleanText
        words = []
        # Guard against empty or single-space lemma output.
        if lemmaText and lemmaText != " ":
            lemmas.createLemmas()
            for w in lemmas.wordList:
                # word dict: surface form, tf, raw count, POS tag.
                word = {}
                word['word']=w.word
                word['tf']=w.tf
                word['count']=w.count
                word['pos']=w.wtype
                words.append(word)
        return words

    def inventories(self):
        """Read the ServiceInventaire directory of .xml files."""
        directory = '../../DATA_SETS/ServiceInventaire/'
        xml_files = [f for f in listdir(directory)
                     if isfile(join(directory, f)) and f.endswith('.xml')]
        for filename in xml_files:
            tree = ET.parse(join(directory, filename))
            root = tree.getroot()
            document = {}
            if root.attrib.get("reference"):
                document["record_id"] = root.attrib.get("reference").strip()
            document["title"] = self.ct.cleanText(
                root.find("edifice").text, "FR")[0]
            # NOTE(review): the 'historique' field is cleaned twice and the
            # second copy re-joined on ';' -- appears redundant; confirm.
            document["description"] = self.ct.cleanText(
                root.find("historique").text, "FR")[0] + ' ' + ' '.join(
                    self.ct.cleanText(root.find("historique").text,
                                      "FR")[0].split(';'))
            document["keywords"] = []
            # Each keyword source is an optional ';'-separated list.
            if root.find("denomination").text is not None:
                document["keywords"] += root.find(
                    "denomination").text.split(";")
            if root.find("grosOeuvres").text is not None:
                document["keywords"] += root.find(
                    "grosOeuvres").text.split(";")
            if root.find("materiauxCouverture").text is not None:
                document["keywords"] += root.find(
                    "materiauxCouverture").text.split(";")
            if root.find("couvrement").text is not None:
                document["keywords"] += root.find("couvrement").text.split(";")
            document["epoch"] = root.find("epoqueConstruction").text.split(";")
            document["location"] = root.find("localisation").text.split(";")
            words = self.getWords(document["description"])
            if words:
                document['words'] = words
            self.documents.append(document)

    def vdn(self):
        """Read the LaVoixDuNord directory of article .xml files."""
        directory = '../../DATA_SETS/LaVoixDuNord/'
        xml_files = [f for f in listdir(directory)
                     if isfile(join(directory, f)) and f.endswith('.xml')]
        for filename in xml_files:
            tree = ET.parse(join(directory, filename))
            root = tree.getroot()
            for child in root.findall('DOCUMENT'):
                # Prefix the XML id so record ids are namespaced.
                key = 'document_' + child.attrib.get("id")
                document = {}
                document['record_id'] = key
                document['source'] = child.find('DESCRIPTION').find(
                    'SOURCE').text
                document['author'] = child.find('DESCRIPTION').find(
                    'AUTEUR').text
                document['source_location'] = child.find('DESCRIPTION').find(
                    'REFERENCE').text
                document['date'] = child.find('DESCRIPTION').find('DATE').text
                document['title'] = self.ct.cleanText(
                    child.find('DESCRIPTION').find('TITRE').text, "FR")[0]
                document['description'] = self.ct.cleanText(
                    child.find('TEXTE').text, "FR")[0]
                words = self.getWords(document["description"])
                if words:
                    document['words'] = words
                self.documents.append(document)